diff options
author | Georgi Gerganov <ggerganov@gmail.com> | 2024-05-04 08:32:32 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-05-04 08:32:32 +0300 |
commit | 92139b90af4841d7fd060b526bdd443b621770ff (patch) | |
tree | 9679c3de1b39970ca73b5bd988c63ddac0359ca6 /convert-hf-to-gguf.py | |
parent | a2ac89d6efb41b535778bfeaecaae8fe295b6ed3 (diff) |
tests : add test-tokenizer-0.sh + fix some tokenizers (#7036)
* tests : add test-tokenizer-0.sh
* unicode : add all unicode number ranges
* starcoder : fix pre-tokenizer
* tests : add test that fails with DeepSeek tokenizers
* falcon : fix regex
* unicode : regenerate unicode tables
* refact : add tokenizer model
* lint : fix
* tests : disable failing tests
ggml-ci
* refact : add tests files
ggml-ci
* convert : print -> logging
ggml-ci
* lint : fix
* unicode : digit -> number
* phi-3 : update
Diffstat (limited to 'convert-hf-to-gguf.py')
-rwxr-xr-x | convert-hf-to-gguf.py | 5 |
1 files changed, 4 insertions, 1 deletions
```diff
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 52932628..88c16676 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -308,6 +308,9 @@ class Model(ABC):
         if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
             # ref: https://huggingface.co/openai-community/gpt2
             res = "gpt-2"
+        if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
+            # ref: https://huggingface.co/smallcloudai/Refact-1_6-base
+            res = "refact"
 
         if res is None:
             logger.warning("\n")
@@ -324,7 +327,7 @@ class Model(ABC):
             logger.warning("\n")
             raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
 
-        logger.debug(f"tokenizer.ggml.pre: {res}")
+        logger.debug(f"tokenizer.ggml.pre: {repr(res)}")
         logger.debug(f"chkhsh: {chkhsh}")
 
         return res
```