From 6f3a3ba7e249cd689cb1ab0376e6504fb6cd49e7 Mon Sep 17 00:00:00 2001
From: Fizz~ <168598314+fizzAI@users.noreply.github.com>
Date: Sun, 6 Jul 2025 06:13:55 -0400
Subject: Special handling of Seed Coder FIM tokens (#585)

* Special handling of Seed Coder FIM tokens

* vocab: Add Seed Coder pretokenizer

* Formatting fix

* Update llama.h
---
 src/llama.cpp | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'src/llama.cpp')

diff --git a/src/llama.cpp b/src/llama.cpp
index 3ee95939..564304f6 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6302,6 +6302,10 @@ static void llm_load_vocab(
                 tokenizer_pre == "bailingmoe") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
                 vocab.tokenizer_clean_spaces = false;
+            } else if (
+                tokenizer_pre == "seed-coder") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
+                vocab.tokenizer_clean_spaces = false;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
-- 
cgit v1.2.3