From fe625c621d033bace55bd9e09455d4af5803a2ba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Sun, 17 Aug 2025 11:23:16 +0200
Subject: [PATCH 1/2] force patch_embd weights to f32

---
 convert_hf_to_gguf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index bd21e55f4a90c..f1ff84c8db362 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -302,7 +302,7 @@ def prepare_tensors(self):
                 data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims)
 
                 # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors
-                if n_dims <= 1 or new_name.endswith("_norm.weight"):
+                if n_dims <= 1 or new_name.endswith("_norm.weight") or ".patch_embd.weight" in new_name:
                     data_qtype = gguf.GGMLQuantizationType.F32
 
                 # Conditions should closely match those in llama_model_quantize_internal in llama.cpp

From 28722d164fe80546c063f08837bb502c47afe4e9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Sun, 17 Aug 2025 12:22:46 +0200
Subject: [PATCH 2/2] use MmprojModel base tensor_force_quant instead

---
 convert_hf_to_gguf.py | 30 +++++++++++++-----------------
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index f1ff84c8db362..b45c8f1d7feb1 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -302,7 +302,7 @@ def prepare_tensors(self):
                 data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims)
 
                 # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors
-                if n_dims <= 1 or new_name.endswith("_norm.weight") or ".patch_embd.weight" in new_name:
+                if n_dims <= 1 or new_name.endswith("_norm.weight"):
                     data_qtype = gguf.GGMLQuantizationType.F32
 
                 # Conditions should closely match those in llama_model_quantize_internal in llama.cpp
@@ -1334,6 +1334,12 @@ def _find_param(self, obj: dict[str, Any], keys: Iterable[str], optional: bool =
             return None
         raise KeyError(f"could not find any of: {keys}")
 
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        del bid, name, n_dims  # unused
+        if ".patch_embd.weight" in new_name:
+            return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
+        return False
+
 
 @ModelBase.register("GPTNeoXForCausalLM")
 class GPTNeoXModel(TextModel):
@@ -2305,10 +2311,9 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_vision_use_gelu(True)
 
     def tensor_force_quant(self, name, new_name, bid, n_dims):
-        del bid, new_name, n_dims  # unused
         if ".embeddings." in name:
             return gguf.GGMLQuantizationType.F32
-        return False
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
@@ -3296,12 +3301,9 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_vision_attention_layernorm_eps(self.global_config.get("rms_norm_eps", 1e-6))
 
     def tensor_force_quant(self, name, new_name, bid, n_dims):
-        del bid, name, n_dims  # unused
-        if ".patch_embd." in new_name:
-            return gguf.GGMLQuantizationType.F16
         if ".position_embd." in new_name:
             return gguf.GGMLQuantizationType.F32
-        return False
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
@@ -3374,10 +3376,9 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         yield ("audio_tower.embed_positions.weight", pos_embd)
 
     def tensor_force_quant(self, name, new_name, bid, n_dims):
-        del bid, new_name, n_dims  # unused
         if ".conv" in name and ".weight" in name:
             return gguf.GGMLQuantizationType.F16
-        return False
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         if name.startswith("thinker."):
@@ -3423,12 +3424,9 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio))
 
     def tensor_force_quant(self, name, new_name, bid, n_dims):
-        del bid, name, n_dims  # unused
-        if ".patch_embd." in new_name:
-            return gguf.GGMLQuantizationType.F16
         if ".position_embd." in new_name:
             return gguf.GGMLQuantizationType.F32
-        return False
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
 
     def _mapping_interns1_name(self, name):
         names_map = {
@@ -5062,13 +5060,12 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor)
 
     def tensor_force_quant(self, name, new_name, bid, n_dims):
-        del bid, new_name, n_dims  # unused
         # related to https://github.com/ggml-org/llama.cpp/issues/13025
         if "input_projection" in name:
             return gguf.GGMLQuantizationType.F16
         if ".embeddings." in name:
             return gguf.GGMLQuantizationType.F32
-        return False
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
@@ -7727,10 +7724,9 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
 
     def tensor_force_quant(self, name, new_name, bid, n_dims):
-        del bid, new_name, n_dims  # unused
         if ".conv" in name and ".weight" in name:
             return gguf.GGMLQuantizationType.F16
-        return False
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
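
A minimal, self-contained sketch (not part of the patch series) of the dispatch pattern the second commit establishes: model-specific `tensor_force_quant` overrides handle their own special cases and then defer via `super()` to the `MmprojModel` base method, which keeps `patch_embd` weights high-precision (F16 only for pure-F16 output files, F32 otherwise). `QType` and `FileType` are simplified stand-ins for `gguf.GGMLQuantizationType` and `gguf.LlamaFileType`, so this runs without the gguf package.

```python
# Illustrative sketch only; enum names below are stand-ins, not the gguf API.
from enum import Enum


class QType(Enum):       # stand-in for gguf.GGMLQuantizationType
    F32 = "F32"
    F16 = "F16"


class FileType(Enum):    # stand-in for gguf.LlamaFileType
    MOSTLY_F16 = "F16"
    MOSTLY_Q8_0 = "Q8_0"


class MmprojModel:
    def __init__(self, ftype: FileType):
        self.ftype = ftype

    def tensor_force_quant(self, name, new_name, bid, n_dims):
        del bid, name, n_dims  # unused
        # Base rule: patch embeddings stay high-precision; F16 is used
        # only when the whole file is F16, otherwise force F32.
        if ".patch_embd.weight" in new_name:
            return QType.F16 if self.ftype == FileType.MOSTLY_F16 else QType.F32
        return False        # False means "no override, quantize normally"


class Qwen2VLVisionModel(MmprojModel):
    def tensor_force_quant(self, name, new_name, bid, n_dims):
        # Model-specific case first, then fall through to the base rule.
        if ".position_embd." in new_name:
            return QType.F32
        return super().tensor_force_quant(name, new_name, bid, n_dims)


model = Qwen2VLVisionModel(FileType.MOSTLY_Q8_0)
assert model.tensor_force_quant("x", "v.patch_embd.weight", None, 2) is QType.F32
assert model.tensor_force_quant("x", "v.position_embd.weight", None, 2) is QType.F32
assert model.tensor_force_quant("x", "v.blk.0.attn_q.weight", None, 2) is False
```

This is why the first commit's blanket F32 rule in `prepare_tensors` could be dropped again: centralizing the check in the base class lets each subclass keep only its genuinely model-specific exceptions, which is what removes the repeated `.patch_embd.`/`return False` boilerplate across the seven overrides above.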