From 154e0d75fccf1784fe9ff6fd76a630b66563da3d Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Sat, 27 Jul 2024 07:55:01 +0200 Subject: Merge mainline llama.cpp (#3) * Merging mainline - WIP * Merging mainline - WIP AVX2 and CUDA appear to work. CUDA performance seems slightly (~1-2%) lower as it is so often the case with llama.cpp/ggml after some "improvements" have been made. * Merging mainline - fix Metal * Remove check --------- Co-authored-by: Iwan Kawrakow --- examples/llama.swiftui/llama.cpp.swift/LibLlama.swift | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'examples/llama.swiftui/llama.cpp.swift/LibLlama.swift') diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index 737f882f..58c32ca5 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -26,11 +26,12 @@ actor LlamaContext { private var context: OpaquePointer private var batch: llama_batch private var tokens_list: [llama_token] + var is_done: Bool = false /// This variable is used to store temporarily invalid cchars private var temporary_invalid_cchars: [CChar] - var n_len: Int32 = 64 + var n_len: Int32 = 1024 var n_cur: Int32 = 0 var n_decode: Int32 = 0 @@ -160,6 +161,7 @@ actor LlamaContext { if llama_token_is_eog(model, new_token_id) || n_cur == n_len { print("\n") + is_done = true let new_token_str = String(cString: temporary_invalid_cchars + [0]) temporary_invalid_cchars.removeAll() return new_token_str @@ -322,7 +324,7 @@ actor LlamaContext { defer { result.deallocate() } - let nTokens = llama_token_to_piece(model, token, result, 8, false) + let nTokens = llama_token_to_piece(model, token, result, 8, 0, false) if nTokens < 0 { let newResult = UnsafeMutablePointer.allocate(capacity: Int(-nTokens)) @@ -330,7 +332,7 @@ actor LlamaContext { defer { newResult.deallocate() } - let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, false) + let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, 0, false) let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens)) return Array(bufferPointer) } else { -- cgit v1.2.3