diff options
author | compilade <git@compilade.net> | 2024-05-08 18:16:38 -0400 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-05-08 18:16:38 -0400 |
commit | f98eb31c517c95960df1d0abc48002787f145f3b (patch) | |
tree | de51a7b79fa5e6488ed4f76b5d0867d6c23d3c51 /convert.py | |
parent | bc4bba364fb96d908f2698e908648df5e6f55e02 (diff) |
convert-hf : save memory with lazy evaluation (#7075)
* convert-hf : begin refactoring write_tensor
* convert : upgrade to sentencepiece v0.2.0
* convert-hf : remove unused n_dims in extra_*_tensors
* convert-hf : simplify MoE weights stacking
* convert-hf : flake8 linter doesn't like semicolons
* convert-hf : allow unusual model part names
For example, loading `model-00001-of-00001.safetensors` now works.
* convert-hf : fix stacking MoE expert tensors
`torch.stack` and `torch.cat` don't do the same thing.
* convert-hf : fix Mamba conversion
Tested to work even with a SentencePiece-based tokenizer.
* convert : use a string for the SentencePiece tokenizer path
* convert-hf : display tensor shape
* convert-hf : convert norms to f32 by default
* convert-hf : sort model part names
`os.listdir` is said to list files in arbitrary order.
Sorting the file names should let "model-00009-of-00042.safetensors"
be loaded before "model-00010-of-00042.safetensors".
* convert-hf : use an ABC for Model again
It seems Protocol can't be used as a statically type-checked ABC,
because its subclasses also can't be instantiated. (why did it seem to work?)
At least there's still a way to throw an error when forgetting to define
the `model_arch` property of any registered Model subclasses.
* convert-hf : use a plain class for Model, and forbid direct instantiation
There are no abstract methods used anyway,
so using ABC isn't really necessary.
* convert-hf : more consistent formatting of cmdline args
* convert-hf : align the message logged for converted tensors
* convert-hf : fix Refact conversion
* convert-hf : save memory with lazy evaluation
* convert-hf : flake8 doesn't like lowercase L as a variable name
* convert-hf : remove einops requirement for InternLM2
* convert-hf : faster model parts loading
Instead of pre-loading them all into a dict, iterate on the tensors
in the model parts progressively as needed in Model.write_tensors.
Conversion for some architectures relies on checking for the presence
of specific tensor names, so for multi-part models, the weight map is read
from the relevant json file to quickly get these names up-front.
* convert-hf : minor changes for consistency
* gguf-py : add tqdm as a dependency
It's small, and it is used for a progress bar
in GGUFWriter.write_tensors_to_file.
Diffstat (limited to 'convert.py')
-rwxr-xr-x | convert.py | 20 |
1 file changed, 12 insertions, 8 deletions
@@ -284,6 +284,7 @@ class Params: n_experts = None n_experts_used = None f_rope_freq_base = None + n_ff = None # hack to determine LLaMA v1 vs v2 vs CodeLlama if config.get("moe"): @@ -308,6 +309,8 @@ class Params: n_experts_used = config["moe"]["num_experts_per_tok"] f_rope_freq_base = 1e6 + assert n_ff is not None + return Params( n_vocab = model["tok_embeddings.weight"].shape[0], n_embd = config["dim"], @@ -462,7 +465,8 @@ class SentencePieceVocab(Vocab): # not found in alternate location either raise FileNotFoundError('Cannot find tokenizer.model') - self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer)) + self.sentencepiece_tokenizer = SentencePieceProcessor() + self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer)) vocab_size = self.sentencepiece_tokenizer.vocab_size() new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size} @@ -482,23 +486,23 @@ class SentencePieceVocab(Vocab): def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.sentencepiece_tokenizer for i in range(tokenizer.vocab_size()): - piece = tokenizer.id_to_piece(i) + piece = tokenizer.IdToPiece(i) text = piece.encode("utf-8") - score: float = tokenizer.get_score(i) + score: float = tokenizer.GetScore(i) toktype = gguf.TokenType.NORMAL - if tokenizer.is_unknown(i): + if tokenizer.IsUnknown(i): toktype = gguf.TokenType.UNKNOWN - if tokenizer.is_control(i): + if tokenizer.IsControl(i): toktype = gguf.TokenType.CONTROL # NOTE: I think added_tokens are user defined. 
# ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED - if tokenizer.is_unused(i): + if tokenizer.IsUnused(i): toktype = gguf.TokenType.UNUSED - if tokenizer.is_byte(i): + if tokenizer.IsByte(i): toktype = gguf.TokenType.BYTE yield text, score, toktype @@ -906,7 +910,7 @@ class LazyUnpickler(pickle.Unpickler): def rebuild_from_type_v2(func, new_type, args, state): return func(*args) - CLASSES = { + CLASSES: dict[tuple[str, str], type[LazyTensor] | LazyStorageKind] = { # getattr used here as a workaround for mypy not being smart enough to determine # the staticmethods have a __func__ attribute. ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'), |