Improve handling of special tokens in GGML to GGUF converter (#2725)

* Improve UNK, BOS, EOS token handling when converting without metadata.
* Allow importing as a module.
* Remove some obsolete code and minor cleanups.
* Set default UNK token mapping from -1 to 0 in llama.cpp.
* Try to handle overflow due to buggy Windows Python with a better error message.
author: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> 2023-08-22 17:39:39 -0600
committer: GitHub <noreply@github.com> 2023-08-22 17:39:39 -0600
commit: 777f42ba18b29f25c71ff8de3ecf97b8017304c0 (patch)
tree: c4622646a366bd1f302293cb8aa7b0420d18b17e /llama.cpp
parent: 46ef5b5fcf4c366e1fb27726b6394adbbf8fd0ea (diff)
1 files changed, 1 insertions, 1 deletions
diff --git a/llama.cpp b/llama.cpp
index 6c5da130..fd8eaa18 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -703,7 +703,7 @@ struct llama_vocab {
     // default LLaMA special tokens
     id special_bos_id = 1;
     id special_eos_id = 2;
-    id special_unk_id = -1;
+    id special_unk_id = 0;
     id special_sep_id = -1;
     id special_pad_id = -1;
author	Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>	2023-08-22 17:39:39 -0600
committer	GitHub <noreply@github.com>	2023-08-22 17:39:39 -0600
commit	777f42ba18b29f25c71ff8de3ecf97b8017304c0 (patch)
tree	c4622646a366bd1f302293cb8aa7b0420d18b17e /llama.cpp
parent	46ef5b5fcf4c366e1fb27726b6394adbbf8fd0ea (diff)