From cf658adc832badaaa2ca119fe86070e5a830f8f6 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 23 Aug 2023 23:08:04 +0300
Subject: llm : add Falcon support (#2717)

* llama : refactor GGUF constants into static maps
* llama : check if model architecture is known
* llama : refactor llama_model_load_internal()
* gguf : add KV constant maps
* llm : read arch-specific KVs
* convert : add dummy scores + types
* falcon : load tensor data (CPU only)
* llama : fix loading progress bar
* llama : add arch member to llama_model
* falcon : CPU inference working
* falcon : support non-40B models
* falcon : minor
* llama : minor updates

ggml-ci

* convert-falcon-hf-to-gguf.py : fix special token mapping
* llama.cpp : llama default UNK token = id 0
* llama.cpp : fix bpe tokenizer
* llama.cpp : fix the fix of bpe tokenizer
* ggml : pass eps to ggml_norm
* metal : implement RoPE (mode = 2) + avoid ggml_repeat
* ggml : ggml_repeat always creates new tensor
* falcon : copy-paste self-attention from LLaMA
* metal : print extra compute pipeline info
* falcon : minor changes (still chasing the Metal problem)
* llama.cpp : fix linefeed token
* metal : fix GELU kernel numerical stability by using precise::tanh
* metal : temporary workaround for the concurrency optimization bug
* falcon : add CUDA offloading (#2739)
* llama : better model naming and size reporting
* llama : prep new tokenizer support
* llama : advanced BPE tokenizer based on ggllm.cpp implementation
* llama : remove obsolete comment

ggml-ci

* common : remove obsolete BPE API + disable test-tokenizer-1
* llama : revert BPE special-case in llama_byte_to_token()
* cuda : add TODOs for RoPE NeoX implementation
* llama : default special tokens based on vocab type
* perplexity : add log for start of tokenization

---------

Co-authored-by: klosax <131523366+klosax@users.noreply.github.com>
Co-authored-by: slaren
---
 examples/main/main.cpp | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'examples/main')

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 0a22f3c2..1393f0b0 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -43,7 +43,7 @@ static bool is_interacting = false;
 void sigint_handler(int signo) {
     if (signo == SIGINT) {
         if (!is_interacting) {
-            is_interacting=true;
+            is_interacting = true;
         } else {
             console::cleanup();
             printf("\n");
@@ -189,10 +189,12 @@ int main(int argc, char ** argv) {
         }
     }

+    const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
+
     // tokenize the prompt
     std::vector<llama_token> embd_inp;
     if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
-        embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+        embd_inp = ::llama_tokenize(ctx, params.prompt, is_spm);
     } else {
         embd_inp = session_tokens;
     }
@@ -208,9 +210,9 @@ int main(int argc, char ** argv) {
     int original_prompt_len = 0;
     if (ctx_guidance) {
         params.cfg_negative_prompt.insert(0, 1, ' ');
-        guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, true);
+        guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, is_spm);

-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, is_spm);
         original_prompt_len = original_inp.size();
         guidance_offset = (int)guidance_inp.size() - original_prompt_len;
     }
@@ -257,8 +259,8 @@ int main(int argc, char ** argv) {
     }

     // prefix & suffix for instruct mode
-    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true);
-    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
+    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", is_spm);
+    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);

     // in instruct mode, we inject a prefix and a suffix to each input by the user
     if (params.instruct) {
-- 
cgit v1.2.3
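The substantive change in this patch gates BOS insertion on the vocabulary type: SentencePiece (LLaMA-style) vocabs expect a leading BOS token when tokenizing a prompt, while BPE vocabs such as Falcon's do not. A minimal sketch of that pattern, not taken from the patch itself, assuming the llama.cpp API of this period (llama_vocab_type(), LLAMA_VOCAB_TYPE_SPM, and the ::llama_tokenize() helper from common.h); the wrapper name tokenize_prompt is hypothetical:

    #include <string>
    #include <vector>

    #include "common.h"   // ::llama_tokenize(ctx, text, add_bos)
    #include "llama.h"    // llama_vocab_type(), LLAMA_VOCAB_TYPE_SPM

    // Tokenize a prompt, adding BOS only for SentencePiece (SPM) vocabs.
    // BPE-based models such as Falcon are tokenized without a leading BOS.
    static std::vector<llama_token> tokenize_prompt(llama_context * ctx, const std::string & prompt) {
        const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
        return ::llama_tokenize(ctx, prompt, add_bos);
    }

In the patch, the same flag (is_spm) is reused for the negative prompt and the instruct-mode prefix, while the instruct-mode suffix is always tokenized without BOS.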