diff options
author | Georgi Gerganov <ggerganov@gmail.com> | 2024-05-28 13:55:35 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-05-28 13:55:35 +0300 |
commit | 8b99e2aa66ba39e4e1114effea6ef7430881eca4 (patch) | |
tree | 4c036c5c457803fd7ecf896337f0e37dd4001714 | |
parent | 271ff3fc44a6ecfcea3ebc192e67567d578b7772 (diff) |
llama : handle unknown utf8 bytes (#7588)
-rw-r--r-- | llama.cpp | 11 |
1 files changed, 10 insertions, 1 deletions
```diff
@@ -17940,7 +17940,16 @@ static std::string llama_decode_text(const std::string & text) {
 
     const auto cpts = unicode_cpts_from_utf8(text);
     for (const auto cpt : cpts) {
-        decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
+        const auto utf8 = unicode_cpt_to_utf8(cpt);
+        try {
+            decoded_text += unicode_utf8_to_byte(utf8);
+        } catch (const std::out_of_range & e) {
+            decoded_text += "[UNK_BYTE_0x";
+            for (const auto c : utf8) {
+                decoded_text += format("%02x", (uint8_t) c);
+            }
+            decoded_text += text + "]";
+        }
     }
 
     return decoded_text;
```