From a5e7dbd6141128bfa3c40a19c2945a181df625d3 Mon Sep 17 00:00:00 2001 From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> Date: Sun, 22 Oct 2023 12:14:56 -0600 Subject: llama : validate special token ids are in range when loading GGUF model (#3635) * Add validation for special token ids to llama.cpp Small optimization for llama_byte_to_token SPM mode * Fix BPE newline check, only I could break something so simple * Killll meeeeee * Account for GGUF_KEY_KEY only setting when the key exists * Minor code cleanups. * Fix convert.py error msg when added tokens are out of range * Make gguf SpecialVocab vocab size-aware Update conversion scripts accordingly * Avoid a string copy Co-authored-by: Georgi Gerganov --------- Co-authored-by: Georgi Gerganov --- llama.cpp | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) (limited to 'llama.cpp') diff --git a/llama.cpp b/llama.cpp index 36534933..8d52eaf6 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2238,15 +2238,35 @@ static void llm_load_vocab( if (vocab.type == LLAMA_VOCAB_TYPE_SPM) { vocab.linefeed_id = llama_byte_to_token(vocab, '\n'); } else { - vocab.linefeed_id = llama_tokenize_internal(vocab, "\u010A", false)[0]; + const std::vector ids = llama_tokenize_internal(vocab, "\u010A", false); + GGML_ASSERT(!ids.empty() && "model vocab missing newline token"); + vocab.linefeed_id = ids[0]; } // special tokens - GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID)); - GGUF_GET_KEY(ctx, vocab.special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_EOS_ID)); - GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_UNK_ID)); - GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_SEP_ID)); - GGUF_GET_KEY(ctx, vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_PAD_ID)); + { + const std::vector> special_token_types = { + { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id }, + { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id }, + { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id }, + { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id }, + { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id }, + }; + for (const auto & it : special_token_types) { + const std::string & key = kv(std::get<0>(it)); + int32_t & id = std::get<1>(it), old_id = id; + + GGUF_GET_KEY(ctx, id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, key); + // Must be >= -1 and < vocab size. Since the key is unsigned, -1 + // can only come from the default value, so there's no point in + // validating that. + if (size_t(id + 1) > vocab.id_to_token.size()) { + LLAMA_LOG_WARN("%s: bad special token: '%s' = %d, using default id %d\n", + __func__, key.c_str(), id, old_id); + id = old_id; + } + } + } // build special tokens cache { @@ -6103,11 +6123,10 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) { } static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) { + static const char * hex = "0123456789ABCDEF"; switch (llama_vocab_get_type(vocab)) { case LLAMA_VOCAB_TYPE_SPM: { - char buf[7]; - int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch); - GGML_ASSERT(0 <= result && result < 7); + const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 }; return vocab.token_to_id.at(buf); } case LLAMA_VOCAB_TYPE_BPE: { -- cgit v1.2.3