summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author opparco <parco.opaai@gmail.com> 2023-09-03 19:18:09 +0900
committer GitHub <noreply@github.com> 2023-09-03 13:18:09 +0300
commit 37301347767d555d0a66c043ce4ef6ead8e61c55 (patch)
tree b567150655402795fcc2110a538ccd6cb8615e7f
parent d9151e6f570eb20bfd54427bd8a337d9b1a08018 (diff)
llama : fix bpe tokenize from byte (#2889)
-rw-r--r-- llama.cpp 10
1 files changed, 8 insertions, 2 deletions
diff --git a/llama.cpp b/llama.cpp
index 2b0cf30f..c97c1462 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3366,9 +3366,15 @@ struct llm_tokenizer_bpe {
std::string byte_str(1, *j);
auto token_multibyte = vocab.token_to_id.find(byte_str);
if (token_multibyte == vocab.token_to_id.end()) {
- fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
+ try {
+ llama_token token_byte = llama_byte_to_token(vocab, *j);
+ output.push_back(token_byte);
+ } catch (const std::out_of_range & err) {
+ fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
+ }
+ } else {
+ output.push_back((*token_multibyte).second);
}
- output.push_back((*token_multibyte).second);
}
} else {
output.push_back((*token).second);