llama : fix command-r inference when omitting outputs (#6367)

author: compilade <113953597+compilade@users.noreply.github.com> 2024-03-28 08:05:54 -0400
committer: GitHub <noreply@github.com> 2024-03-28 14:05:54 +0200
commit: 0308f5e3d7bf9879f818b1a4ae589ff36b242af5 (patch)
tree: c68ce4fbaa405c88179a4d7667a5dd0e38493fad
parent: 28cb9a09c4d10a489be1238abe7a858dcd4d65f2 (diff)
1 files changed, 3 insertions, 2 deletions
diff --git a/llama.cpp b/llama.cpp
index 892d46fb..77ec9b7a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -9152,8 +9152,9 @@ struct llm_build_context {
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
-                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+                cur     = ggml_get_rows(ctx0,     cur, inp_out_ids);
+                inpL    = ggml_get_rows(ctx0,    inpL, inp_out_ids);
+                ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
             }
 
             struct ggml_tensor * attn_out = cur;
author	compilade <113953597+compilade@users.noreply.github.com>	2024-03-28 08:05:54 -0400
committer	GitHub <noreply@github.com>	2024-03-28 14:05:54 +0200
commit	0308f5e3d7bf9879f818b1a4ae589ff36b242af5 (patch)
tree	c68ce4fbaa405c88179a4d7667a5dd0e38493fad
parent	28cb9a09c4d10a489be1238abe7a858dcd4d65f2 (diff)