diff options
author | Georgi Gerganov <ggerganov@gmail.com> | 2023-06-04 23:34:30 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-06-04 23:34:30 +0300 |
commit | ecb217db4fcfa3880300ad08531a5fb6bb142d45 (patch) | |
tree | e7a1a1fee49036f2ee46b419fb032966b8e62222 /ggml.c | |
parent | dcb2ed48268e421baf25adc00d602dad0f415564 (diff) |
llama : Metal inference (#1642)
* mtl : export the LLaMA computation graph
* ci : disable temporary
* mtl : adapt the MNIST example as starter
* mtl : no need for mtl-export tool, add cli arg for main instead
* mtl : export just a small part of the graph for now to make it easier
* mtl : move MSL code into separate file for easy editing
* mtl : initial get_rows_q4_0 kernel
* mtl : confirmed get_rows_q4_0 is working correctly
* mtl : add rms_norm kernel + confirm working
* mtl : add mul kernel + confirm working
* mtl : initial mul_mat Q4 kernel (wrong results)
* mtl : mul_mat fixes (still wrong)
* mtl : another mul_mat Q4 (still does not work)
* mtl : working mul_mat q4
* ggml : fix handling of "view" ops in ggml_graph_import()
* mtl : add rope kernel
* mtl : add reshape and transpose handling
* ggml : store offset as opt arg for ggml_view_xd() operators
* mtl : add cpy kernel + handle view ops
* mtl : confirm f16 x f32 attention mul mat
* mtl : add scale kernel
* mtl : add diag_mask_inf kernel
* mtl : fix soft_max kernel
* ggml : update ggml_nbytes() to handle non-contiguous tensors
* mtl : verify V tensor contents
* mtl : add f32 -> f32 cpy kernel
* mtl : add silu kernel
* mtl : add non-broadcast mul kernel
* mtl : full GPU inference of the computation graph
* mtl : optimize rms_norm and soft_max kernels
* mtl : add f16 mat x f32 vec multiplication kernel
* mtl : fix bug in f16 x f32 mul mat + speed-up computation
* mtl : faster mul_mat_q4_0_f32 kernel
* mtl : fix kernel signature + roll inner loop
* mtl : more threads for rms_norm + better timing
* mtl : remove printfs from inner loop
* mtl : simplify implementation
* mtl : add save/load vocab to ggml file
* mtl : plug Metal inference into llama.cpp (very quick-n-dirty)
* mtl : make it work with main example
Lots of hacks but at least now it generates text
* mtl : preparing for merge
* mtl : clean-up ggml mtl interface + suport scratch / inplace
* mtl : remove temp / debug code
* metal : final refactoring and simplification
* Revert "ci : disable temporary"
This reverts commit 98c267fc77fe811082f672538fc91bcfc9072d63.
* metal : add comments
* metal : clean-up stuff, fix typos
* readme : add Metal instructions
* readme : add example for main
Diffstat (limited to 'ggml.c')
-rw-r--r-- | ggml.c | 151 |
1 files changed, 113 insertions, 38 deletions
@@ -3723,7 +3723,7 @@ int64_t ggml_nelements(const struct ggml_tensor * tensor) { return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; } -int ggml_nrows(const struct ggml_tensor * tensor) { +int64_t ggml_nrows(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; @@ -3732,7 +3732,14 @@ int ggml_nrows(const struct ggml_tensor * tensor) { size_t ggml_nbytes(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - return (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]; + // this should handle cases where the tensor is not contiguous in memory + // probaby just: + // + // return tensor->ne[3]*tensor->nb[3] + // + // is enough, but just in case, adding the second part + + return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]); } int ggml_blck_size(enum ggml_type type) { @@ -3814,11 +3821,11 @@ size_t ggml_tensor_overhead(void) { return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16; } -static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) { +bool ggml_is_transposed(const struct ggml_tensor * tensor) { return tensor->nb[0] > tensor->nb[1]; } -static inline bool ggml_is_contiguous(const struct ggml_tensor * tensor) { +bool ggml_is_contiguous(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return @@ -5802,10 +5809,18 @@ struct ggml_tensor * ggml_view_1d( struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset); + ggml_scratch_save(ctx); + + struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + memcpy(offs->data, &offset, 2*sizeof(int32_t)); + + ggml_scratch_load(ctx); + result->op = GGML_OP_VIEW; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; + result->opt[0] = offs; if (is_node) { memcpy(result->padding, &offset, sizeof(offset)); @@ -5834,6 +5849,13 @@ struct ggml_tensor * ggml_view_2d( struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset); + ggml_scratch_save(ctx); + + struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + memcpy(offs->data, &offset, 2*sizeof(int32_t)); + + ggml_scratch_load(ctx); + result->nb[1] = nb1; result->nb[2] = result->nb[1]*ne1; result->nb[3] = result->nb[2]; @@ -5842,6 +5864,7 @@ struct ggml_tensor * ggml_view_2d( result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; + result->opt[0] = offs; if (is_node) { memcpy(result->padding, &offset, sizeof(offset)); @@ -5872,6 +5895,13 @@ struct ggml_tensor * ggml_view_3d( struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset); + ggml_scratch_save(ctx); + + struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + memcpy(offs->data, &offset, 2*sizeof(int32_t)); + + ggml_scratch_load(ctx); + result->nb[1] = nb1; result->nb[2] = nb2; result->nb[3] = result->nb[2]*ne2; @@ -5880,6 +5910,7 @@ struct ggml_tensor * ggml_view_3d( result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; + result->opt[0] = offs; if (is_node) { memcpy(result->padding, &offset, sizeof(offset)); @@ -5912,6 +5943,13 @@ struct ggml_tensor * ggml_view_4d( struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset); + ggml_scratch_save(ctx); + + struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + memcpy(offs->data, &offset, 2*sizeof(int32_t)); + + ggml_scratch_load(ctx); + result->nb[1] = nb1; result->nb[2] = nb2; result->nb[3] = nb3; @@ -5920,6 +5958,7 @@ struct ggml_tensor * ggml_view_4d( result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; + result->opt[0] = offs; if (is_node) { memcpy(result->padding, &offset, sizeof(offset)); @@ -9252,7 +9291,7 @@ static void ggml_compute_forward_rms_norm_f32( sum += (ggml_float)(x[i00] * x[i00]); } - float mean = sum/ne00; + const float mean = sum/ne00; float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); @@ -11163,7 +11202,7 @@ static void ggml_compute_forward_rope_f32( theta *= theta_scale; const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); const float x0 = src[0]; const float x1 = src[1]; @@ -11184,7 +11223,7 @@ static void ggml_compute_forward_rope_f32( const int64_t i0 = ib*n_dims + ic/2; const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); const float x0 = src[0]; const float x1 = src[n_dims/2]; @@ -14588,7 +14627,7 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou const int64_t * ne = tensor->ne; const size_t * nb = tensor->nb; - fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %16s\n", + fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %32s\n", ggml_type_name(tensor->type), ggml_op_name (tensor->op), tensor->n_dims, @@ -14602,7 +14641,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char const int64_t * ne = tensor->ne; const size_t * nb = tensor->nb; - fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %16s\n", + fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %32s\n", arg, ggml_type_name(tensor->type), ggml_op_name (tensor->op), @@ -14615,8 +14654,8 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char } void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { - assert(cgraph->work == NULL); - assert(cgraph->work_size == 0); + //assert(cgraph->work == NULL); + //assert(cgraph->work_size == 0); uint64_t size_eval = 0; @@ -14837,7 +14876,6 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** // read file into data { FILE * fin = fopen(fname, "rb"); - if (!fin) { fprintf(stderr, "%s: failed to open %s\n", __func__, fname); return result; @@ -14977,6 +15015,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** op = *(const uint32_t *) ptr; ptr += sizeof(op); n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims); + enum ggml_op eop = (enum ggml_op) op; + int64_t ne[GGML_MAX_DIMS]; size_t nb[GGML_MAX_DIMS]; @@ -14991,42 +15031,77 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** nb[j] = nb_cur; } - struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); - - tensor->op = (enum ggml_op) op; + uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); // TODO: not yet used - uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); + const char * ptr_name = ptr; ptr += GGML_MAX_NAME; - memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME; + const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += (2 + GGML_MAX_OPT)*sizeof(int32_t); - for (int j = 0; j < GGML_MAX_DIMS; ++j) { - tensor->nb[j] = nb[j]; - } + struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL }; // parse args - { - struct ggml_tensor ** args[2 + GGML_MAX_OPT] = { - &tensor->src0, - &tensor->src1, - }; + for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) { + const int32_t arg_idx = ptr_arg_idx[j]; - for (int j = 0; j < GGML_MAX_OPT; ++j) { - args[2 + j] = &tensor->opt[j]; + if (arg_idx == -1) { + continue; } - for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) { - const int32_t arg_idx = *(const int32_t *) ptr; ptr += sizeof(arg_idx); + if (arg_idx < GGML_MAX_NODES) { + args[j] = result.leafs[arg_idx]; + } else { + args[j] = result.nodes[arg_idx - GGML_MAX_NODES]; + } + } - if (arg_idx == -1) { - continue; - } + // create the tensor + // "view" operations are handled differently + // TODO: handle inplace ops - currently a copy is always made - if (arg_idx < GGML_MAX_NODES) { - *args[j] = result.leafs[arg_idx]; - } else { - *args[j] = result.nodes[arg_idx - GGML_MAX_NODES]; - } - } + struct ggml_tensor * tensor = NULL; + + switch (eop) { + // TODO: implement other view ops + case GGML_OP_RESHAPE: + { + tensor = ggml_reshape_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3]); + } break; + case GGML_OP_VIEW: + { + tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0); + + uint64_t offs; + memcpy(&offs, args[2]->data, sizeof(offs)); + + tensor->data = ((char *) tensor->data) + offs; + } break; + case GGML_OP_TRANSPOSE: + { + tensor = ggml_transpose(*ctx_eval, args[0]); + } break; + case GGML_OP_PERMUTE: + { + tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0); + } break; + default: + { + tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); + + tensor->op = eop; + } break; + } + + memcpy(tensor->name, ptr_name, GGML_MAX_NAME); + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + tensor->nb[j] = nb[j]; + } + + tensor->src0 = args[0]; + tensor->src1 = args[1]; + + for (int j = 0; j < GGML_MAX_OPT; ++j) { + tensor->opt[j] = args[2 + j]; } result.nodes[i] = tensor; |