diff options
author | opparco <parco.opaai@gmail.com> | 2023-09-03 19:18:09 +0900 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-09-03 13:18:09 +0300 |
commit | 37301347767d555d0a66c043ce4ef6ead8e61c55 (patch) | |
tree | b567150655402795fcc2110a538ccd6cb8615e7f | |
parent | d9151e6f570eb20bfd54427bd8a337d9b1a08018 (diff) |
llama : fix bpe tokenize from byte (#2889)
-rw-r--r-- | llama.cpp | 10 |
1 file changed, 8 insertions, 2 deletions
@@ -3366,9 +3366,15 @@ struct llm_tokenizer_bpe {
                         std::string byte_str(1, *j);
                         auto token_multibyte = vocab.token_to_id.find(byte_str);
                         if (token_multibyte == vocab.token_to_id.end()) {
-                            fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
+                            try {
+                                llama_token token_byte = llama_byte_to_token(vocab, *j);
+                                output.push_back(token_byte);
+                            } catch (const std::out_of_range & err) {
+                                fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
+                            }
+                        } else {
+                            output.push_back((*token_multibyte).second);
                         }
-                        output.push_back((*token_multibyte).second);
                     }
                 } else {
                     output.push_back((*token).second);