Merge mainline llama.cpp (#3)

* Merging mainline - WIP
* Merging mainline - WIP

  AVX2 and CUDA appear to work. CUDA performance seems slightly (~1-2%) lower, as is so often the case with llama.cpp/ggml after some "improvements" have been made.
* Merging mainline - fix Metal
* Remove check

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
author: Kawrakow <48489457+ikawrakow@users.noreply.github.com> 2024-07-27 07:55:01 +0200
committer: GitHub <noreply@github.com> 2024-07-27 07:55:01 +0200
commit: 154e0d75fccf1784fe9ff6fd76a630b66563da3d (patch)
tree: 81ce6dbb5b1900c1aa78a879f0593c694cab9d27 /examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
parent: 0684c3e9c70d49323b4fc517128cbe222cab7f96 (diff)
1 files changed, 5 insertions, 3 deletions
diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
index 737f882f..58c32ca5 100644
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -26,11 +26,12 @@ actor LlamaContext {
     private var context: OpaquePointer
     private var batch: llama_batch
     private var tokens_list: [llama_token]
+    var is_done: Bool = false
 
     /// This variable is used to store temporarily invalid cchars
     private var temporary_invalid_cchars: [CChar]
 
-    var n_len: Int32 = 64
+    var n_len: Int32 = 1024
     var n_cur: Int32 = 0
 
     var n_decode: Int32 = 0
@@ -160,6 +161,7 @@ actor LlamaContext {
 
         if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
             print("\n")
+            is_done = true
             let new_token_str = String(cString: temporary_invalid_cchars + [0])
             temporary_invalid_cchars.removeAll()
             return new_token_str
@@ -322,7 +324,7 @@ actor LlamaContext {
         defer {
             result.deallocate()
         }
-        let nTokens = llama_token_to_piece(model, token, result, 8, false)
+        let nTokens = llama_token_to_piece(model, token, result, 8, 0, false)
 
         if nTokens < 0 {
             let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
@@ -330,7 +332,7 @@ actor LlamaContext {
             defer {
                 newResult.deallocate()
             }
-            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, false)
+            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, 0, false)
             let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
             return Array(bufferPointer)
         } else {
author	Kawrakow <48489457+ikawrakow@users.noreply.github.com>	2024-07-27 07:55:01 +0200
committer	GitHub <noreply@github.com>	2024-07-27 07:55:01 +0200
commit	154e0d75fccf1784fe9ff6fd76a630b66563da3d (patch)
tree	81ce6dbb5b1900c1aa78a879f0593c694cab9d27 /examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
parent	0684c3e9c70d49323b4fc517128cbe222cab7f96 (diff)