summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> 2023-10-04 08:20:28 -0600
committer GitHub <noreply@github.com> 2023-10-04 17:20:28 +0300
commit 019ba1dcd0c7775a5ac0f7442634a330eb0173cc (patch)
tree 17e5ebd836d212dfb97c85f72a7008d35f4a385e
parent beabc8cfb0145b48aad68fefc573d316fe9c3a8a (diff)
convert : fix Baichuan2 models by using vocab size in config.json (#3299)
Use local GGUF package when possible in Baichuan converter
-rwxr-xr-x convert-baichuan-hf-to-gguf.py 10
1 files changed, 8 insertions, 2 deletions
diff --git a/convert-baichuan-hf-to-gguf.py b/convert-baichuan-hf-to-gguf.py
index 8bd34dc4..513a7516 100755
--- a/convert-baichuan-hf-to-gguf.py
+++ b/convert-baichuan-hf-to-gguf.py
@@ -11,11 +11,14 @@ import sys
from pathlib import Path
from typing import TYPE_CHECKING, Any
import itertools
-import gguf
import numpy as np
import torch
from sentencepiece import SentencePieceProcessor # type: ignore[import]
+if 'NO_LOCAL_GGUF' not in os.environ:
+ sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf
+
if TYPE_CHECKING:
from typing import TypeAlias
@@ -174,8 +177,11 @@ if not tokenizer_model_file.is_file():
print("gguf: get sentencepiece tokenizer vocab, scores and token types")
tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
+vocab_size = hparams.get('vocab_size')
+if vocab_size is None:
+ vocab_size = tokenizer.vocab_size()
-for i in range(tokenizer.vocab_size()):
+for i in range(vocab_size):
text: bytes
score: float