author    Georgi Gerganov <ggerganov@gmail.com>  2023-11-13 14:16:23 +0200
committer GitHub <noreply@github.com>            2023-11-13 14:16:23 +0200
commit    4760e7cc0b68570d58f55e8dda469805d1759d0d (patch)
tree      cd983b1f2833f0094c0539f7943703c6787bf12b /examples/benchmark/benchmark-matmult.cpp
parent    bb50a792ec2a49944470c82694fa364345e95170 (diff)
sync : ggml (backend v2) (#3912)
* sync : ggml (backend v2) (wip)
* sync : migrate examples and llama.cpp to dynamic graphs (wip)
* sync : update tests + fix max op params to 64 ggml-ci
* sync : ggml-cuda ggml-ci
* llama : fix save/load state context size ggml-ci
* sync : try to fix build on tvOS
* sync : pass custom graph sizes in training examples
* sync : update graph copies to new ggml API
* sync : update sync-ggml.sh with new files
* scripts : fix header in sync script
* train : fix context size calculations
* llama : increase inference graph size up to 4096 nodes
* train : allocate grads for backward graphs
* train : allocate grads for gb_tmp
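For context, the core API change this diff applies to the benchmark: the old `ggml_build_forward` returned a `struct ggml_cgraph` by value, while the new API allocates the graph inside the `ggml_context` with `ggml_new_graph` and builds it in place with `ggml_build_forward_expand`, so the graph is handled by pointer everywhere. Below is a minimal, self-contained sketch of the new pattern; the context size, tensor shapes, and thread count are illustrative assumptions, not values from this patch.

#include "ggml.h"

int main(void) {
    // illustrative context size; benchmark-matmult.cpp computes its own
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
    ggml_set_f32(a, 1.0f);
    ggml_set_f32(b, 2.0f);
    struct ggml_tensor * c = ggml_mul_mat(ctx, a, b);

    // before this patch: struct ggml_cgraph gf = ggml_build_forward(c);
    // after: the graph lives in the context and is passed by pointer
    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);

    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/4);

    ggml_free(ctx);
    return 0;
}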
Diffstat (limited to 'examples/benchmark/benchmark-matmult.cpp')
-rw-r--r--  examples/benchmark/benchmark-matmult.cpp | 21 ++++++++++++---------
1 file changed, 12 insertions(+), 9 deletions(-)
diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp
index 76e3f57c..284733b1 100644
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -171,7 +171,8 @@ int main(int argc, char ** argv) {
struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
// printf("Creating compute graph\n");
- struct ggml_cgraph gf = ggml_build_forward(m11xm2);
+ struct ggml_cgraph * gf = ggml_new_graph(ctx);
+ ggml_build_forward_expand(gf, m11xm2);
printf("n_threads=%i\n", benchmark_params.n_threads);
@@ -180,9 +181,9 @@ int main(int argc, char ** argv) {
std::vector<uint8_t> work_buffer;
- ggml_graph_compute_helper(work_buffer, &gf, benchmark_params.n_threads);
+ ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
- TENSOR_DUMP(gf.nodes[0]);
+ TENSOR_DUMP(gf->nodes[0]);
printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
@@ -200,7 +201,8 @@ int main(int argc, char ** argv) {
struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);
// printf("Creating compute graph\n");
- struct ggml_cgraph gf31 = ggml_build_forward(q31);
+ struct ggml_cgraph * gf31 = ggml_new_graph(ctx);
+ ggml_build_forward_expand(gf31, q31);
// Set up a second graph computation to make sure we override the CPU cache lines
// printf("Creating new tensor q12 & Running quantize\n");
@@ -211,7 +213,8 @@ int main(int argc, char ** argv) {
struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
//printf("Creating compute graph\n");
- struct ggml_cgraph gf32 = ggml_build_forward(q32);
+ struct ggml_cgraph * gf32 = ggml_new_graph(ctx);
+ ggml_build_forward_expand(gf32, q32);
printf("n_threads=%i\n", benchmark_params.n_threads);
const int dimx = sizex;
@@ -223,7 +226,7 @@ int main(int argc, char ** argv) {
// Let's use the F32 result from above as a reference for the quantized multiplication
- float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
+ float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]);
printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
printf("=====================================================================================\n");
@@ -233,7 +236,7 @@ int main(int argc, char ** argv) {
long long int start = ggml_time_us();
//printf("Running ggml_graph_compute\n");
- ggml_graph_compute_helper(work_buffer, &gf31, benchmark_params.n_threads);
+ ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads);
long long int stop = ggml_time_us();
long long int usec = stop-start;
@@ -251,7 +254,7 @@ int main(int argc, char ** argv) {
// Check that the matrix multiplication result is in the right ballpark
// We cannot use the exact value from the F32 multiplication because the quantization will be slightly different
- float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
+ float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]);
float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
@@ -266,7 +269,7 @@ int main(int argc, char ** argv) {
}
// Running a different graph computation to make sure we override the CPU cache lines
- ggml_graph_compute_helper(work_buffer, &gf32, benchmark_params.n_threads);
+ ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads);
}
printf("\n");
printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));