summaryrefslogtreecommitdiff
path: root/convert.py
diff options
context:
space:
mode:
authorPedro Cuenca <pedro@huggingface.co>2024-04-21 13:50:41 +0200
committerGitHub <noreply@github.com>2024-04-21 14:50:41 +0300
commitb97bc3966e852adb626c90be64fd48282800f504 (patch)
tree178656d15821205889fa03ec603c7327facbb265 /convert.py
parentb8109bc0139f15a5b321909f47510b89dca47ffc (diff)
llama : support Llama 3 HF conversion (#6745)
* Support Llama 3 conversion

The tokenizer is BPE.

* style

* Accept suggestion

Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>

* llama : add llama_token_is_eog()

ggml-ci

* llama : auto-detect more EOT tokens when missing in KV data

* convert : replacing EOS token is a hack

* llama : fix codegemma EOT token + add TODOs

* llama : fix model type string for 8B model

---------

Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Diffstat (limited to 'convert.py')
-rwxr-xr-xconvert.py9
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/convert.py b/convert.py
index 24df0a4d..1c700cf6 100755
--- a/convert.py
+++ b/convert.py
@@ -525,7 +525,14 @@ class LlamaHfVocab(Vocab):
# pre-check so we know if we need transformers
tokenizer_model: dict[str, Any] = tokenizer_json['model']
- if (
+ is_llama3 = (
+ tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
+ and not tokenizer_model.get('byte_fallback', True)
+ )
+ if is_llama3:
+ raise TypeError('Llama 3 must be converted with BpeVocab')
+
+ if not is_llama3 and (
tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
or tokenizer_json['decoder']['type'] != 'Sequence'
):