summaryrefslogtreecommitdiff
path: root/convert.py
diff options
context:
space:
mode:
Diffstat (limited to 'convert.py')
-rwxr-xr-xconvert.py9
1 files changed, 8 insertions, 1 deletions
diff --git a/convert.py b/convert.py
index 24df0a4d..1c700cf6 100755
--- a/convert.py
+++ b/convert.py
@@ -525,7 +525,14 @@ class LlamaHfVocab(Vocab):
# pre-check so we know if we need transformers
tokenizer_model: dict[str, Any] = tokenizer_json['model']
- if (
+ is_llama3 = (
+ tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
+ and not tokenizer_model.get('byte_fallback', True)
+ )
+ if is_llama3:
+ raise TypeError('Llama 3 must be converted with BpeVocab')
+
+ if not is_llama3 and (
tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
or tokenizer_json['decoder']['type'] != 'Sequence'
):