summaryrefslogtreecommitdiff
path: root/src/llama-vocab.cpp
diff options
context:
space:
mode:
author Aleksey Nikiforov <lexn82@gmail.com> 2025-07-14 12:43:52 -0400
committer GitHub <noreply@github.com> 2025-07-14 18:43:52 +0200
commit f5353047ef461e6fc9d527e09a06c9802c699929 (patch)
tree 206c8c56efd3dcac1e39655e73788affe6c02832 /src/llama-vocab.cpp
parent 255c22046bcaef41850125be924f3e42e2a65571 (diff)
Ported kimi-k2 support from llama.cpp (#609)
Original patch by @gabriellarson: https://github.com/ggml-org/llama.cpp/pull/14654

Co-authored-by: anikifoss <anikifoss>
Diffstat (limited to 'src/llama-vocab.cpp')
-rw-r--r-- src/llama-vocab.cpp 7
1 files changed, 7 insertions, 0 deletions
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 7bae4fec..109a6659 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -464,6 +464,13 @@ struct llm_tokenizer_bpe {
"[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
};
break;
+ case LLAMA_VOCAB_PRE_TYPE_KIMI_K2:
+ regex_exprs = {
+ // K2 trigger pattern - this will activate the custom K2 handler in unicode.cpp
+ // The custom handler implements all K2 patterns with proper Han character exclusion
+ "\\p{Han}+",
+ };
+ break;
case LLAMA_VOCAB_PRE_TYPE_SUPERBPE:
regex_exprs = {
"\\p{N}+",