Diffstat (limited to 'ggml.c')
-rw-r--r--  ggml.c  327
1 file changed, 302 insertions, 25 deletions
diff --git a/ggml.c b/ggml.c
index 995a2faa..acdba033 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1420,6 +1420,34 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in
#endif
}
+static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+
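+// per-type quantization handlers: row quantize/dequantize plus the matching dot-product kernel;
+// Q4_0 quantizes the activation row to Q8_0 for the dot product, Q4_1 dots against a Q4_1 row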
+static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
+ [GGML_TYPE_Q4_0] = {
+ .dequantize_row_q = dequantize_row_q4_0,
+ .quantize_row_q = quantize_row_q4_0,
+ .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference,
+ .quantize_row_q_dot = quantize_row_q8_0,
+ .vec_dot_q = ggml_vec_dot_q4_0_q8_0,
+ },
+ [GGML_TYPE_Q4_1] = {
+ .dequantize_row_q = dequantize_row_q4_1,
+ .quantize_row_q = quantize_row_q4_1,
+ .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference,
+ .quantize_row_q_dot = quantize_row_q4_1,
+ .vec_dot_q = ggml_vec_dot_q4_1,
+ },
+ // TODO: GGML_TYPE_Q8_0
+};
+
+// For internal test use
+quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
+ GGML_ASSERT(i < GGML_TYPE_COUNT);
+ return quantize_fns[i];
+}
+
+
//
// simd mappings
//
@@ -5588,6 +5616,26 @@ static void ggml_compute_forward_dup_f16(
}
}
}
+ } else if (dst->type == GGML_TYPE_Q4_0 || dst->type == GGML_TYPE_Q4_1) {
+ quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q;
+ size_t id = 0;
+ uint8_t * dst_ptr = (uint8_t *) dst->data;
+ size_t dst_row_size = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
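+ // one F32 scratch row from params->wdata (reserved for quantizing DUP/CPY in ggml_graph_compute)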
+ float * src0_f32 = (float *) params->wdata;
+
+ for (int i03 = 0; i03 < ne03; i03++) {
+ for (int i02 = 0; i02 < ne02; i02++) {
+ for (int i01 = 0; i01 < ne01; i01++) {
+ const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+ // convert to f32 and quantize
+ for (int i00 = 0; i00 < ne00; i00++) {
+ src0_f32[i00] = GGML_FP16_TO_FP32(src0_ptr[i00]);
+ }
+ quantize_row_q(src0_f32, dst_ptr + id, ne00);
+ id += dst_row_size;
+ }
+ }
+ }
} else {
GGML_ASSERT(false); // TODO: implement
}
@@ -5780,6 +5828,21 @@ static void ggml_compute_forward_dup_f32(
}
}
}
+ } else if (dst->type == GGML_TYPE_Q4_0 || dst->type == GGML_TYPE_Q4_1) {
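+ // F32 source rows are already in the right format, so each row is quantized straight into dst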
+ quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q;
+ size_t id = 0;
+ uint8_t * dst_ptr = (uint8_t *) dst->data;
+ size_t dst_row_size = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
+
+ for (int i03 = 0; i03 < ne03; i03++) {
+ for (int i02 = 0; i02 < ne02; i02++) {
+ for (int i01 = 0; i01 < ne01; i01++) {
+ const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+ quantize_row_q(src0_ptr, dst_ptr + id, ne00);
+ id += dst_row_size;
+ }
+ }
+ }
} else {
GGML_ASSERT(false); // TODO: implement
}
@@ -5968,6 +6031,212 @@ static void ggml_compute_forward_add_f32(
}
}
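+// dst = src0 (F16) + src1 (F32), with the sum converted back to F16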
+static void ggml_compute_forward_add_f16_f32(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ struct ggml_tensor * dst) {
+ GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int n = ggml_nrows(src0);
+ const int nc = src0->ne[0];
+
+ const size_t nb00 = src0->nb[0];
+ const size_t nb01 = src0->nb[1];
+
+ const size_t nb10 = src1->nb[0];
+ const size_t nb11 = src1->nb[1];
+
+ const size_t nb0 = dst->nb[0];
+ const size_t nb1 = dst->nb[1];
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
+ GGML_ASSERT(dst->type == GGML_TYPE_F16);
+
+ GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
+ GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+
+ if (nb10 == sizeof(float)) {
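+ // rows are interleaved across threads: thread ith handles rows ith, ith + nth, ...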
+ for (int j = ith; j < n; j += nth) {
+ ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + j*nb1);
+ ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01);
+ for (int i = 0; i < nc; i++) {
+ float * src1_ptr = (float *) ((char *) src1->data + j*nb11 + i*nb10);
+ dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + *src1_ptr);
+ }
+ }
+ }
+ else {
+ // src1 is not contiguous
+ GGML_ASSERT(false);
+ }
+}
+
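+// dst = src0 (F16) + src1 (F16), accumulated in F32 and stored as F16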
+static void ggml_compute_forward_add_f16_f16(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ struct ggml_tensor * dst) {
+ GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int n = ggml_nrows(src0);
+ const int nc = src0->ne[0];
+
+ const size_t nb00 = src0->nb[0];
+ const size_t nb01 = src0->nb[1];
+
+ const size_t nb10 = src1->nb[0];
+ const size_t nb11 = src1->nb[1];
+
+ const size_t nb0 = dst->nb[0];
+ const size_t nb1 = dst->nb[1];
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
+ GGML_ASSERT(src1->type == GGML_TYPE_F16);
+ GGML_ASSERT(dst->type == GGML_TYPE_F16);
+
+ GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
+ GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+
+ if (nb10 == sizeof(ggml_fp16_t)) {
+ for (int j = ith; j < n; j += nth) {
+ ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + j*nb1);
+ ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01);
+ for (int i = 0; i < nc; i++) {
+ ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + j*nb11 + i*nb10);
+ dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + GGML_FP16_TO_FP32(*src1_ptr));
+ }
+ }
+ }
+ else {
+ // src1 is not contiguous
+ GGML_ASSERT(false);
+ }
+}
+
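+// dst = src0 (quantized Q4_0/Q4_1) + src1 (F32): each src0 row is dequantized, added, and re-quantized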
+static void ggml_compute_forward_add_q_f32(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ struct ggml_tensor * dst) {
+ GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ const int64_t ne00 = src0->ne[0];
+ const int64_t ne01 = src0->ne[1];
+ const int64_t ne02 = src0->ne[2];
+ const int64_t ne03 = src0->ne[3];
+
+ //const int64_t ne10 = src1->ne[0];
+ //const int64_t ne11 = src1->ne[1];
+ const int64_t ne12 = src1->ne[2];
+ const int64_t ne13 = src1->ne[3];
+
+ //const int64_t ne0 = dst->ne[0];
+ //const int64_t ne1 = dst->ne[1];
+ const int64_t ne2 = dst->ne[2];
+ const int64_t ne3 = dst->ne[3];
+
+ const int nb00 = src0->nb[0];
+ const int nb01 = src0->nb[1];
+ const int nb02 = src0->nb[2];
+ const int nb03 = src0->nb[3];
+
+ const int nb10 = src1->nb[0];
+ const int nb11 = src1->nb[1];
+ const int nb12 = src1->nb[2];
+ const int nb13 = src1->nb[3];
+
+ const int nb0 = dst->nb[0];
+ const int nb1 = dst->nb[1];
+ const int nb2 = dst->nb[2];
+ const int nb3 = dst->nb[3];
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ GGML_ASSERT(ne02 == ne12);
+ GGML_ASSERT(ne03 == ne13);
+ GGML_ASSERT(ne2 == ne12);
+ GGML_ASSERT(ne3 == ne13);
+
+ const enum ggml_type type = src0->type;
+ dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
+ quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q;
+
+ // we don't support permuted src0 or src1
+ GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[type]);
+ GGML_ASSERT(nb10 == sizeof(float));
+
+ // dst cannot be transposed or permuted
+ GGML_ASSERT(nb0 <= nb1);
+ GGML_ASSERT(nb1 <= nb2);
+ GGML_ASSERT(nb2 <= nb3);
+
+ GGML_ASSERT(src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1);
+ GGML_ASSERT(dst->type == src0->type);
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+ // total rows in src0
+ const int nr = ne01*ne02*ne03;
+
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
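+ // per-thread F32 scratch row; GGML_OP_ADD reserves ne00 * n_threads floats of wdata for this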
+ float * wdata = (float*) params->wdata + ne00 * ith;
+
+ for (int ir = ir0; ir < ir1; ++ir) {
+ // src0 indices
+ const int i03 = ir/(ne02*ne01);
+ const int i02 = (ir - i03*ne02*ne01)/ne01;
+ const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+ // src1 and dst are same shape as src0 => same indices
+ const int i13 = i03;
+ const int i12 = i02;
+ const int i11 = i01;
+
+ const int i3 = i03;
+ const int i2 = i02;
+ const int i1 = i01;
+
+ void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
+ float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13));
+ void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
+
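+ // Q4_0/Q4_1 pack values in blocks of 32, so the row length must be a multiple of the block size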
+ assert(ne00 % 32 == 0);
+
+ // dequantize row from src0 to temp buffer
+ dequantize_row_q(src0_row, wdata, ne00);
+ // add src1
+ ggml_vec_acc_f32(ne00, wdata, src1_row);
+ // quantize row to dst
+ quantize_row_q(wdata, dst_row, ne00);
+ }
+}
+
static void ggml_compute_forward_add(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
@@ -5978,6 +6247,23 @@ static void ggml_compute_forward_add(
{
ggml_compute_forward_add_f32(params, src0, src1, dst);
} break;
+ case GGML_TYPE_F16:
+ {
+ if (src1->type == GGML_TYPE_F16) {
+ ggml_compute_forward_add_f16_f16(params, src0, src1, dst);
+ }
+ else if (src1->type == GGML_TYPE_F32) {
+ ggml_compute_forward_add_f16_f32(params, src0, src1, dst);
+ }
+ else {
+ GGML_ASSERT(false);
+ }
+ } break;
+ case GGML_TYPE_Q4_0:
+ case GGML_TYPE_Q4_1:
+ {
+ ggml_compute_forward_add_q_f32(params, src0, src1, dst);
+ } break;
default:
{
GGML_ASSERT(false);
@@ -7257,30 +7543,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
//}
}
-static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
- [GGML_TYPE_Q4_0] = {
- .dequantize_row_q = dequantize_row_q4_0,
- .quantize_row_q = quantize_row_q4_0,
- .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference,
- .quantize_row_q_dot = quantize_row_q8_0,
- .vec_dot_q = ggml_vec_dot_q4_0_q8_0,
- },
- [GGML_TYPE_Q4_1] = {
- .dequantize_row_q = dequantize_row_q4_1,
- .quantize_row_q = quantize_row_q4_1,
- .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference,
- .quantize_row_q_dot = quantize_row_q4_1,
- .vec_dot_q = ggml_vec_dot_q4_1,
- },
- // TODO: GGML_TYPE_Q8_0
-};
-
-// For internal test use
-quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
- GGML_ASSERT(i < GGML_TYPE_COUNT);
- return quantize_fns[i];
-}
-
static void ggml_compute_forward_mul_mat_q_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
@@ -10137,13 +10399,29 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
struct ggml_tensor * node = cgraph->nodes[i];
switch (node->op) {
+ case GGML_OP_CPY:
case GGML_OP_DUP:
{
node->n_tasks = 1;
+
+ size_t cur = 0;
+ if (node->type == GGML_TYPE_Q4_0 || node->type == GGML_TYPE_Q4_1) {
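+ // quantizing DUP/CPY needs one F32 row of scratch (e.g. to hold an F16 row converted to F32)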
+ cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0];
+ }
+
+ work_size = MAX(work_size, cur);
} break;
case GGML_OP_ADD:
{
node->n_tasks = n_threads;
+
+ size_t cur = 0;
+
+ if (node->src0->type == GGML_TYPE_Q4_0 || node->src0->type == GGML_TYPE_Q4_1) {
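+ // quantized ADD dequantizes src0 rows into per-thread F32 scratch, hence the n_threads factor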
+ cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads;
+ }
+
+ work_size = MAX(work_size, cur);
} break;
case GGML_OP_SUB:
case GGML_OP_MUL:
@@ -10224,7 +10502,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
{
node->n_tasks = n_threads;
} break;
- case GGML_OP_CPY:
case GGML_OP_CONT:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW: