author    Kawrakow <48489457+ikawrakow@users.noreply.github.com>  2024-07-27 07:55:01 +0200
committer GitHub <noreply@github.com>  2024-07-27 07:55:01 +0200
commit    154e0d75fccf1784fe9ff6fd76a630b66563da3d (patch)
tree      81ce6dbb5b1900c1aa78a879f0593c694cab9d27 /examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
parent    0684c3e9c70d49323b4fc517128cbe222cab7f96 (diff)
Merge mainline llama.cpp (#3)
* Merging mainline - WIP

* Merging mainline - WIP

  AVX2 and CUDA appear to work. CUDA performance seems slightly (~1-2%)
  lower, as is so often the case with llama.cpp/ggml after some
  "improvements" have been made.

* Merging mainline - fix Metal

* Remove check

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'examples/llama.swiftui/llama.cpp.swift/LibLlama.swift')
-rw-r--r--  examples/llama.swiftui/llama.cpp.swift/LibLlama.swift  8
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
index 737f882f..58c32ca5 100644
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -26,11 +26,12 @@ actor LlamaContext {
     private var context: OpaquePointer
     private var batch: llama_batch
     private var tokens_list: [llama_token]
+    var is_done: Bool = false
 
     /// This variable is used to store temporarily invalid cchars
     private var temporary_invalid_cchars: [CChar]
 
-    var n_len: Int32 = 64
+    var n_len: Int32 = 1024
     var n_cur: Int32 = 0
 
     var n_decode: Int32 = 0
@@ -160,6 +161,7 @@ actor LlamaContext {
 
         if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
             print("\n")
+            is_done = true
            let new_token_str = String(cString: temporary_invalid_cchars + [0])
            temporary_invalid_cchars.removeAll()
            return new_token_str
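Note on the two hunks above: `is_done` gives callers an explicit completion flag instead of making them track token counts themselves, and the generation cap `n_len` is raised from 64 to 1024 tokens. A minimal sketch of how a caller might drive the loop with the new flag, assuming the example's existing `completion_init(text:)` and `completion_loop()` actor methods (this driver function itself is hypothetical, not part of the commit):

    // Hypothetical driver: stream decoded pieces until the context reports
    // end-of-generation via the new `is_done` flag.
    func generate(with llamaContext: LlamaContext, prompt: String) async -> String {
        var output = ""
        await llamaContext.completion_init(text: prompt)
        // completion_loop() returns one decoded piece per call; is_done flips
        // once an end-of-generation token is seen or n_cur reaches n_len.
        while !(await llamaContext.is_done) {
            output += await llamaContext.completion_loop()
        }
        return output
    }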
@@ -322,7 +324,7 @@ actor LlamaContext {
         defer {
             result.deallocate()
         }
-        let nTokens = llama_token_to_piece(model, token, result, 8, false)
+        let nTokens = llama_token_to_piece(model, token, result, 8, 0, false)
 
         if nTokens < 0 {
             let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
@@ -330,7 +332,7 @@ actor LlamaContext {
             defer {
                 newResult.deallocate()
             }
-            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, false)
+            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, 0, false)
             let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
             return Array(bufferPointer)
         } else {
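The last two hunks track a mainline API change: `llama_token_to_piece` gained an extra `Int32` argument between the buffer length and the `special` flag (apparently upstream's `lstrip` parameter; passing 0 keeps the old behavior of stripping nothing). Consolidated, the updated two-pass conversion reads roughly as follows, a sketch of the example's `token_to_piece` helper under the new signature as it would sit inside `LlamaContext` (where `model: OpaquePointer` is available), not an authoritative copy of the file:

    // Sketch of the two-pass conversion after the signature change; the new
    // fifth argument (0 here) is assumed to be upstream's `lstrip` count.
    private func token_to_piece(token: llama_token) -> [CChar] {
        // First pass: try a small fixed buffer.
        let result = UnsafeMutablePointer<Int8>.allocate(capacity: 8)
        result.initialize(repeating: Int8(0), count: 8)
        defer { result.deallocate() }

        let nTokens = llama_token_to_piece(model, token, result, 8, 0, false)
        if nTokens < 0 {
            // A negative return value is the required buffer size, negated:
            // retry with an exactly-sized buffer.
            let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
            newResult.initialize(repeating: Int8(0), count: Int(-nTokens))
            defer { newResult.deallocate() }
            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, 0, false)
            return Array(UnsafeBufferPointer(start: newResult, count: Int(nNewTokens)))
        } else {
            return Array(UnsafeBufferPointer(start: result, count: Int(nTokens)))
        }
    }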