diff options
author | Anas Ahouzi <112881240+aahouzi@users.noreply.github.com> | 2024-05-19 14:46:46 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-05-19 22:46:46 +1000 |
commit | 6aade19ee74b896c59929676629340b36be3e22c (patch) | |
tree | 3c9a5a15ada2e5f87c801a27552598e31d7f8add /llama.cpp | |
parent | ab33f7a338593f6cf1ae98b10b6f8684f63bd72c (diff) |
Add StableLM2 pre-tokenizer (#7349)
* Add StableLM pre-tokenizer
* Fix space
* Fix trailing whitespace
Diffstat (limited to 'llama.cpp')
-rw-r--r-- | llama.cpp | 4 |
1 file changed, 4 insertions, 0 deletions
```diff
@@ -4464,6 +4464,9 @@ static void llm_load_vocab(
                 tokenizer_pre == "qwen2") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
             } else if (
+                tokenizer_pre == "stablelm2") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
+            } else if (
                 tokenizer_pre == "olmo") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
             } else if (
@@ -12363,6 +12366,7 @@ struct llm_tokenizer_bpe {
                         "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                     });
                     break;
+                case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
                 case LLAMA_VOCAB_PRE_TYPE_QWEN2:
                     word_collection = unicode_regex_split(text, {
                         // original regex from tokenizer.json
```