From 6f3a3ba7e249cd689cb1ab0376e6504fb6cd49e7 Mon Sep 17 00:00:00 2001
From: Fizz~ <168598314+fizzAI@users.noreply.github.com>
Date: Sun, 6 Jul 2025 06:13:55 -0400
Subject: Special handling of Seed Coder FIM tokens (#585)

* Special handling of Seed Coder FIM tokens

* vocab: Add Seed Coder pretokenizer

* Formatting fix

* Update llama.h
---
 src/llama.cpp | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'src/llama.cpp')

diff --git a/src/llama.cpp b/src/llama.cpp
index 3ee95939..564304f6 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6302,6 +6302,10 @@ static void llm_load_vocab(
                 tokenizer_pre == "bailingmoe") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
                 vocab.tokenizer_clean_spaces = false;
+            } else if (
+                tokenizer_pre == "seed-coder") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
+                vocab.tokenizer_clean_spaces = false;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
-- 
cgit v1.2.3