author | Iwan Kawrakow <iwan.kawrakow@gmail.com> | 2024-06-19 16:46:23 +0300
committer | Iwan Kawrakow <iwan.kawrakow@gmail.com> | 2024-06-22 12:02:52 +0300
commit | 58d9e8f1d2efba4b6717043f7a5167be670a6f2e (patch)
tree | bc70f7b1197e9572c3efdfa84d349b729c41cf9b /convert-hf-to-gguf.py
parent | 927e251a12fa287e13c6bd9667ee97d783486c09 (diff)
bitnet: put the scale in a separate tensor
and correspondingly add an extra ggml_mul_mat operation.
As per @ggerganov, this is how things should be done.
It seems to be working, but as far as I can tell this
results in a ~15% performance penalty for prompt processing.
Committing so I can go and test on other platforms.
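For intuition, here is a minimal sketch of the arithmetic the message describes (not from the commit; the tensor shapes and variable names are invented). Instead of baking the scale into the quantized weights, the weights are stored as pure ternary values and the matmul result is multiplied by the per-tensor scale afterwards, which corresponds to the extra ggml-side operation the message mentions:

```python
# Sketch only (shapes/names invented): why splitting out the scale adds one op.
import torch

x = torch.randn(4, 8)                    # activations
w = torch.randn(16, 8)                   # full-precision linear weight

# BitNet b1.58 round-to-nearest, as in weight_quant():
s = 1 / w.abs().mean().clamp(min=1e-5)
w_q = (w * s).round().clamp(-1, 1) / s   # ternary values with the scale 1/s baked in

scale = w_q.abs().max()                  # per-tensor scale recovered from the weights
                                         # (assumes at least one nonzero quantized weight)
w_tern = torch.sign(w_q)                 # pure {-1, 0, +1} weights, what gets stored

y_old = x @ w_q.T                        # before: one matmul, scale inside the weights
y_new = (x @ w_tern.T) * scale           # after: ternary matmul plus a scale multiply
assert torch.allclose(y_old, y_new, atol=1e-5)
```

The ~15% prompt-processing penalty is consistent with this: the second operation is extra work per layer that the fused single-matmul path did not need.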
Diffstat (limited to 'convert-hf-to-gguf.py')
-rwxr-xr-x | convert-hf-to-gguf.py | 38 |
1 file changed, 34 insertions, 4 deletions
```diff
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 0c08b800..ebd36a9b 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1416,17 +1416,47 @@ class BitnetModel(Model):
         dtype = weight.dtype
         weight = weight.float()
         s = 1 / weight.abs().mean().clamp(min=1e-5)
-        result = (weight * s).round().clamp(-1, 1) / s
-        return result.type(dtype)
+        weight = (weight * s).round().clamp(-1, 1) / s
+        scale = weight.abs().max().unsqueeze(0)
+        weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
+        weight = torch.sign(weight).type(dtype)
+        return weight.type(dtype), scale.type(torch.float32)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # transform weight into 1/0/-1 (in fp32)
         if name.endswith(("q_proj.weight", "k_proj.weight", "v_proj.weight",
                           "down_proj.weight", "up_proj.weight", "gate_proj.weight",
                           "o_proj.weight")):
-            data_torch = self.weight_quant(data_torch)
+            weight_torch, scale_torch = self.weight_quant(data_torch)
 
-        return [(self.map_tensor_name(name), data_torch)]
+        tensors: list[tuple[str, Tensor]] = []
+
+        if name.endswith("q_proj.weight"):
+            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), weight_torch))
+            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid, suffix=".scale"), scale_torch))
+        elif name.endswith("k_proj.weight"):
+            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), weight_torch))
+            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid, suffix=".scale"), scale_torch))
+        elif name.endswith("v_proj.weight"):
+            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), weight_torch))
+            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid, suffix=".scale"), scale_torch))
+        elif name.endswith("o_proj.weight"):
+            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), weight_torch))
+            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid, suffix=".scale"), scale_torch))
+        elif name.endswith("up_proj.weight"):
+            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), weight_torch))
+            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid, suffix=".scale"), scale_torch))
+        elif name.endswith("down_proj.weight"):
+            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid), weight_torch))
+            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid, suffix=".scale"), scale_torch))
+        elif name.endswith("gate_proj.weight"):
+            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), weight_torch))
+            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid, suffix=".scale"), scale_torch))
+
+        if len(tensors) == 0:
+            tensors.append((self.map_tensor_name(name), data_torch))
+
+        return tensors
 
 
 @Model.register("GrokForCausalLM")
```
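The seven branches differ only in the gguf.MODEL_TENSOR key; each writes the ternary weight and its companion scale under paired names. The sketch below is a hedged illustration of what those pairs resolve to, assuming the gguf-py TENSOR_NAMES templates ("blk.{bid}.attn_q", ...) that the script's name formatting is built on, and that suffix=".scale" replaces the default ".weight" suffix in format_tensor_name:

```python
# Sketch only: the GGUF name pairs the new branches produce, under the
# assumptions stated above. bitnet_pair is a hypothetical helper.
import gguf

def bitnet_pair(key: gguf.MODEL_TENSOR, bid: int) -> tuple[str, str]:
    base = gguf.TENSOR_NAMES[key].format(bid=bid)
    return base + ".weight", base + ".scale"

print(bitnet_pair(gguf.MODEL_TENSOR.ATTN_Q, 0))
# expected: ('blk.0.attn_q.weight', 'blk.0.attn_q.scale')
```

Since the branches are otherwise identical, a dict mapping each HF projection suffix to its MODEL_TENSOR key could collapse the chain, at the cost of diverging from the explicit style used elsewhere in the converter.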