1 files changed, 40 insertions, 20 deletions
diff --git a/convert.py b/convert.py
index 5b6344aa..3ad836ce 100755..100644
--- a/convert.py
+++ b/convert.py
@@ -46,6 +46,7 @@ DEFAULT_CONCURRENCY = 8
 # data types
 #
 
+
 @dataclass(frozen=True)
 class DataType:
     name: str
@@ -55,15 +56,18 @@ class DataType:
     def elements_to_bytes(self, n_elements: int) -> int:
         return n_elements * self.dtype.itemsize
 
+
 @dataclass(frozen=True)
 class UnquantizedDataType(DataType):
     pass
 
+
 DT_F16  = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
 DT_F32  = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
 DT_I32  = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = [])
 DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])
 
+
 @dataclass(frozen=True)
 class QuantizedDataType(DataType):
     block_size: int
@@ -77,6 +81,7 @@ class QuantizedDataType(DataType):
         assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}'
         return self.quantized_dtype.itemsize * (n_elements // self.block_size)
 
+
 @dataclass(frozen=True)
 class Q8_0QuantizedDataType(QuantizedDataType):
     # Mini Q8_0 quantization in Python!
@@ -86,6 +91,7 @@ class Q8_0QuantizedDataType(QuantizedDataType):
         n_blocks = arr.size // self.block_size
         blocks = arr.reshape((n_blocks, self.block_size))
         # Much faster implementation of block quantization contributed by @Cebtenzzre
+
         def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]:
             d = abs(blocks).max(axis = 1) / np.float32(127)
             with np.errstate(divide = 'ignore'):
@@ -94,10 +100,11 @@ class Q8_0QuantizedDataType(QuantizedDataType):
             yield from zip(d, qs)
         return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype)
 
+
 DT_Q8_0 = Q8_0QuantizedDataType('Q8_0',
-    dtype = np.dtype(np.float32), valid_conversions = [],
-    ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
-    quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))]))
+                                dtype = np.dtype(np.float32), valid_conversions = [],
+                                ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
+                                quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))]))
 
 # Quantized types skipped here because they may also map to np.float32
 NUMPY_TYPE_TO_DATA_TYPE: dict[np.dtype[Any], DataType] = {}
@@ -116,6 +123,8 @@ SAFETENSORS_DATA_TYPES: dict[str, DataType] = {
 # TODO: match this with `llama_ftype`
 # TODO: rename to LLAMAFileType
 # TODO: move to `gguf.py`
+
+
 class GGMLFileType(enum.IntEnum):
     AllF32     = 0
     MostlyF16  = 1  # except 1d tensors
@@ -128,6 +137,7 @@ class GGMLFileType(enum.IntEnum):
         # 1D tensors are always F32.
         return dt if len(tensor.shape) > 1 else DT_F32
 
+
 GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
     GGMLFileType.AllF32    : DT_F32,
     GGMLFileType.MostlyF16 : DT_F16,
@@ -138,6 +148,7 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
 # hparams loading
 #
 
+
 @dataclass
 class Params:
     n_vocab:    int
@@ -167,11 +178,11 @@ class Params:
 
         # try transformer naming first
         if "model.layers.0.self_attn.q_proj.weight" in model:
-            n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
+            n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
         elif "model.layers.0.self_attn.W_pack.weight" in model:   # next: try baichuan naming
-            n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
+            n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
         else:
-            n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
+            n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
 
         if n_layer < 1:
             raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
@@ -308,7 +319,7 @@ class BpeVocab:
                     (item['content'], item['id'])
                     for item in tokenizer_json.get('added_tokens', [])
                     # Added tokens here can be duplicates of the main vocabulary.
-                    if item['content'] not in self.bpe_tokenizer )
+                    if item['content'] not in self.bpe_tokenizer)
 
         vocab_size: int = len(self.bpe_tokenizer)
         expected_ids    = list(range(vocab_size, vocab_size + len(added_tokens)))
@@ -326,7 +337,6 @@ class BpeVocab:
 
     def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         tokenizer = self.bpe_tokenizer
-        from transformers.models.gpt2 import tokenization_gpt2
         reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
 
         for i, _ in enumerate(tokenizer):
@@ -406,6 +416,7 @@ class SentencePieceVocab:
     def __repr__(self) -> str:
         return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
 
+
 Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
 
 #
@@ -413,13 +424,14 @@ Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
 # TODO: reuse (probably move to gguf.py?)
 #
 
+
 def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
-    #print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
+    # print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
     if n_head_kv is not None and n_head != n_head_kv:
         n_head = n_head_kv
     return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
-                .swapaxes(1, 2)
-                .reshape(weights.shape))
+            .swapaxes(1, 2)
+            .reshape(weights.shape))
 
 
 class Tensor(metaclass=ABCMeta):
@@ -500,7 +512,7 @@ class LazyTensor:
         ret = self._load()
         # Should be okay if it maps to the same numpy type?
         assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \
-                (self.data_type, ret.data_type, self.description)
+            (self.data_type, ret.data_type, self.description)
         return ret
 
     def astype(self, data_type: DataType) -> LazyTensor:
@@ -588,6 +600,7 @@ def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTe
         return lazy_tensor.load().permute(n_head, n_head_kv)
     return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
 
+
 def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor:
     def load() -> Tensor:
         return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv)
@@ -595,6 +608,7 @@ def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_
     s[0] = s[0] // 3
     return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
 
+
 def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
     def load() -> Tensor:
         return lazy_tensor.load().part(n_part)
@@ -744,6 +758,7 @@ def lazy_load_file(path: Path) -> ModelPlus:
 In = TypeVar('In')
 Out = TypeVar('Out')
 
+
 def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]:
     '''Parallel map, but with backpressure.  If the caller doesn't call `next`
     fast enough, this will stop calling `func` at some point rather than
@@ -778,6 +793,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
                     break
             yield result
 
+
 def check_vocab_size(params: Params, vocab: Vocab) -> None:
     if params.n_vocab != vocab.vocab_size:
         assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
@@ -796,7 +812,7 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None:
 
 
 class OutputFile:
-    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
+    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
         self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
 
     def add_meta_arch(self, params: Params) -> None:
@@ -876,7 +892,7 @@ class OutputFile:
         self.gguf.close()
 
     @staticmethod
-    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
+    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
         check_vocab_size(params, vocab)
 
         of = OutputFile(fname_out, endianess=endianess)
@@ -938,8 +954,9 @@ class OutputFile:
 
         of.close()
 
+
 def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
-    wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type
+    wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) +".weight"].data_type
 
     if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
         return GGMLFileType.AllF32
@@ -952,10 +969,12 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT
 
     raise Exception(f"Unexpected combination of types: {name_to_type}")
 
+
 def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
     return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
             for (name, tensor) in model.items()}
 
+
 def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
     tmap = gguf.TensorNameMap(ARCH, params.n_layer)
     should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
@@ -968,7 +987,7 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
             print(f"Permuting layer {i}")
             tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
             tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
-           #tmp[f"model.layers.{i}.self_attn.v_proj.weight"] =              model[f"model.layers.{i}.self_attn.v_proj.weight"]
+            # tmp[f"model.layers.{i}.self_attn.v_proj.weight"] =              model[f"model.layers.{i}.self_attn.v_proj.weight"]
         elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
             print(f"Unpacking and permuting layer {i}")
             tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
@@ -993,6 +1012,7 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
 
     return out
 
+
 def nth_multifile_path(path: Path, n: int) -> Path | None:
     '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
     the nth path in the model.
@@ -1174,8 +1194,8 @@ def main(args_in: list[str] | None = None) -> None:
         # FIXME: Try to respect vocab_dir somehow?
         vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
         special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
-            load_merges = args.vocabtype == 'bpe',
-            n_vocab = vocab.vocab_size)
+                                          load_merges = args.vocabtype == 'bpe',
+                                          n_vocab = vocab.vocab_size)
         outfile = args.outfile
         OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
         print(f"Wrote {outfile}")
@@ -1188,8 +1208,8 @@ def main(args_in: list[str] | None = None) -> None:
         vocab = load_vocab(vocab_dir, args.vocabtype)
     # FIXME: Try to respect vocab_dir somehow?
     special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
-        load_merges = args.vocabtype == 'bpe',
-        n_vocab = vocab.vocab_size)
+                                      load_merges = args.vocabtype == 'bpe',
+                                      n_vocab = vocab.vocab_size)
 
     model   = model_plus.model
     model   = convert_model_names(model, params)