From b97bc3966e852adb626c90be64fd48282800f504 Mon Sep 17 00:00:00 2001
From: Pedro Cuenca
Date: Sun, 21 Apr 2024 13:50:41 +0200
Subject: llama : support Llama 3 HF conversion (#6745)

* Support Llama 3 conversion

The tokenizer is BPE.

* style

* Accept suggestion

Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>

* llama : add llama_token_is_eog()

ggml-ci

* llama : auto-detect more EOT tokens when missing in KV data

* convert : replacing EOS token is a hack

* llama : fix codegemma EOT token + add TODOs

* llama : fix model type string for 8B model

---------

Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
Co-authored-by: Georgi Gerganov
---
 convert.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/convert.py b/convert.py
index 24df0a4d..1c700cf6 100755
--- a/convert.py
+++ b/convert.py
@@ -525,7 +525,14 @@ class LlamaHfVocab(Vocab):

         # pre-check so we know if we need transformers
         tokenizer_model: dict[str, Any] = tokenizer_json['model']
-        if (
+        is_llama3 = (
+            tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
+            and not tokenizer_model.get('byte_fallback', True)
+        )
+        if is_llama3:
+            raise TypeError('Llama 3 must be converted with BpeVocab')
+
+        if not is_llama3 and (
             tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
             or tokenizer_json['decoder']['type'] != 'Sequence'
         ):
--
cgit v1.2.3
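
Note on the hunk above: the new pre-check distinguishes a Llama 3 tokenizer (BPE with ignore_merges set and byte_fallback disabled) from earlier HF Llama tokenizers and routes it away from LlamaHfVocab. The following is a minimal standalone sketch of that heuristic for anyone who wants to test a checkpoint outside of convert.py; the helper name and the model path are hypothetical, and only the three-field condition is taken from the patch.

    import json
    from pathlib import Path
    from typing import Any

    def looks_like_llama3(tokenizer_json_path: Path) -> bool:
        # Same pre-check the patch adds to LlamaHfVocab: Llama 3 ships a BPE
        # tokenizer with ignore_merges enabled and no byte fallback.
        tokenizer_json = json.loads(tokenizer_json_path.read_text(encoding='utf-8'))
        tokenizer_model: dict[str, Any] = tokenizer_json['model']
        return (
            tokenizer_model['type'] == 'BPE'
            and tokenizer_model.get('ignore_merges', False)
            and not tokenizer_model.get('byte_fallback', True)
        )

    # Hypothetical usage against a local HF checkout; guarded so the sketch
    # still runs when no checkpoint is present.
    path = Path('Meta-Llama-3-8B/tokenizer.json')
    if path.exists() and looks_like_llama3(path):
        print('Llama 3 tokenizer detected: convert with BpeVocab, not LlamaHfVocab')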