summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Georgi Gerganov <ggerganov@gmail.com> 2024-05-28 13:55:35 +0300
committer GitHub <noreply@github.com> 2024-05-28 13:55:35 +0300
commit 8b99e2aa66ba39e4e1114effea6ef7430881eca4 (patch)
tree 4c036c5c457803fd7ecf896337f0e37dd4001714
parent 271ff3fc44a6ecfcea3ebc192e67567d578b7772 (diff)
llama : handle unknown utf8 bytes (#7588)
-rw-r--r-- llama.cpp 11
1 files changed, 10 insertions, 1 deletions
diff --git a/llama.cpp b/llama.cpp
index f67cb7e2..aa493532 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -17940,7 +17940,16 @@ static std::string llama_decode_text(const std::string & text) {
const auto cpts = unicode_cpts_from_utf8(text);
for (const auto cpt : cpts) {
- decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
+ const auto utf8 = unicode_cpt_to_utf8(cpt);
+ try {
+ decoded_text += unicode_utf8_to_byte(utf8);
+ } catch (const std::out_of_range & e) {
+ decoded_text += "[UNK_BYTE_0x";
+ for (const auto c : utf8) {
+ decoded_text += format("%02x", (uint8_t) c);
+ }
+ decoded_text += text + "]";
+ }
}
return decoded_text;