[GGUF] Refactor and decouple gguf checkpoint loading logic (#34385)

* draft load_gguf refactor * update Signed-off-by: Isotr0py <2037008807@qq.com> * remove llama mapping Signed-off-by: Isotr0py <2037008807@qq.com> * remove qwen2 mapping Signed-off-by: Isotr0py <2037008807@qq.com> * remove unused function Signed-off-by: Isotr0py <2037008807@qq.com> * deprecate stablelm mapping Signed-off-by: Isotr0py <2037008807@qq.com> * deprecate phi3 mapping Signed-off-by: Isotr0py <2037008807@qq.com> * deprecate t5 mapping Signed-off-by: Isotr0py <2037008807@qq.com> * deprecate bloom mapping Signed-off-by: Isotr0py <2037008807@qq.com> * fix bloom Signed-off-by: Isotr0py <2037008807@qq.com> * deprecate starcoder2 mapping Signed-off-by: Isotr0py <2037008807@qq.com> * deprecate gpt2 mapping Signed-off-by: Isotr0py <2037008807@qq.com> * deprecate mistral mapping Signed-off-by: Isotr0py <2037008807@qq.com> * deprecate nemotron mapping Signed-off-by: Isotr0py <2037008807@qq.com> * deprecate mamba mapping Signed-off-by: Isotr0py <2037008807@qq.com> * deprecate mamba mapping Signed-off-by: Isotr0py <2037008807@qq.com> * code format Signed-off-by: Isotr0py <2037008807@qq.com> * code format Signed-off-by: Isotr0py <2037008807@qq.com> * fix mamba Signed-off-by: Isotr0py <2037008807@qq.com> * fix qwen2moe Signed-off-by: Isotr0py <2037008807@qq.com> * remove qwen2moe mapping Signed-off-by: Isotr0py <2037008807@qq.com> * clean up Signed-off-by: Isotr0py <2037008807@qq.com> * remove falcon 7b map Signed-off-by: Isotr0py <2037008807@qq.com> * remove all ggml tensors mapping Signed-off-by: Isotr0py <2037008807@qq.com> * add comments Signed-off-by: Isotr0py <2037008807@qq.com> * update messages Signed-off-by: Isotr0py <2037008807@qq.com> * fix tensors in parsed parameters Signed-off-by: Isotr0py <2037008807@qq.com> * add gguf check Signed-off-by: Isotr0py <2037008807@qq.com> --------- Signed-off-by: Isotr0py <2037008807@qq.com>
2026-05-14 20:58:08 +00:00 · 2025-01-07 01:02:38 +08:00 · 2025-01-07 01:02:38 +08:00 · 3951da1a6b
commit 3951da1a6b
parent 86fa3cedad
4 changed files with 92 additions and 293 deletions
--- a/src/transformers/integrations/__init__.py
+++ b/src/transformers/integrations/__init__.py
@@ -57,7 +57,6 @@ _import_structure = {
    "fsdp": ["is_fsdp_managed_module"],
    "ggml": [
        "GGUF_CONFIG_MAPPING",
-        "GGUF_TENSOR_MAPPING",
        "GGUF_TOKENIZER_MAPPING",
        "_gguf_parse_value",
        "load_dequant_gguf_tensor",
@@ -161,7 +160,6 @@ if TYPE_CHECKING:
    from .fsdp import is_fsdp_managed_module
    from .ggml import (
        GGUF_CONFIG_MAPPING,
-        GGUF_TENSOR_MAPPING,
        GGUF_TOKENIZER_MAPPING,
        _gguf_parse_value,
        load_dequant_gguf_tensor,
--- a/src/transformers/integrations/ggml.py
+++ b/src/transformers/integrations/ggml.py
@@ -33,254 +33,6 @@ from ..utils.logging import tqdm
 logger = logging.get_logger(__name__)


-GGUF_TENSOR_MAPPING = {
-    "llama": {
-        "token_embd": "model.embed_tokens",
-        "blk": "model.layers",
-        "ffn_up": "mlp.up_proj",
-        "ffn_down": "mlp.down_proj",
-        "ffn_gate": "mlp.gate_proj",
-        "ffn_norm": "post_attention_layernorm",
-        "attn_norm": "input_layernorm",
-        "attn_q": "self_attn.q_proj",
-        "attn_v": "self_attn.v_proj",
-        "attn_k": "self_attn.k_proj",
-        "attn_output": "self_attn.o_proj",
-        "output.weight": "lm_head.weight",
-        "output_norm": "model.norm",
-    },
-    "mistral": {
-        "token_embd": "model.embed_tokens",
-        "blk": "model.layers",
-        "ffn_up": "mlp.up_proj",
-        "ffn_down": "mlp.down_proj",
-        "ffn_gate": "mlp.gate_proj",
-        "ffn_norm": "post_attention_layernorm",
-        "attn_norm": "input_layernorm",
-        "attn_q": "self_attn.q_proj",
-        "attn_v": "self_attn.v_proj",
-        "attn_k": "self_attn.k_proj",
-        "attn_output": "self_attn.o_proj",
-        "output.weight": "lm_head.weight",
-        "output_norm": "model.norm",
-    },
-    "qwen2": {
-        "token_embd": "model.embed_tokens",
-        "blk": "model.layers",
-        "ffn_up": "mlp.up_proj",
-        "ffn_down": "mlp.down_proj",
-        "ffn_gate": "mlp.gate_proj",
-        "ffn_norm": "post_attention_layernorm",
-        "attn_norm": "input_layernorm",
-        "attn_q": "self_attn.q_proj",
-        "attn_v": "self_attn.v_proj",
-        "attn_k": "self_attn.k_proj",
-        "attn_output": "self_attn.o_proj",
-        "output.weight": "lm_head.weight",
-        "output_norm": "model.norm",
-    },
-    "qwen2moe": {
-        "token_embd": "model.embed_tokens",
-        "blk": "model.layers",
-        "ffn_up_exps": "mlp.experts",
-        "ffn_up_shexp": "mlp.shared_expert.up_proj",
-        "ffn_down_exps": "mlp.experts",
-        "ffn_down_shexp": "mlp.shared_expert.down_proj",
-        "ffn_norm": "post_attention_layernorm",
-        "ffn_gate_inp.weight": "mlp.gate.weight",
-        "ffn_gate_exps": "mlp.experts",
-        "ffn_gate_shexp": "mlp.shared_expert.gate_proj",
-        "ffn_gate_inp_shexp": "mlp.shared_expert_gate",
-        "attn_norm": "input_layernorm",
-        "attn_q": "self_attn.q_proj",
-        "attn_v": "self_attn.v_proj",
-        "attn_k": "self_attn.k_proj",
-        "attn_output": "self_attn.o_proj",
-        "output.weight": "lm_head.weight",
-        "output_norm": "model.norm",
-    },
-    "phi3": {
-        "token_embd": "model.embed_tokens",
-        "blk": "model.layers",
-        "ffn_up": "mlp.gate_up_proj",
-        "ffn_down": "mlp.down_proj",
-        "ffn_gate": "mlp.gate_up_proj",
-        "ffn_norm": "post_attention_layernorm",
-        "attn_norm": "input_layernorm",
-        "attn_qkv": "self_attn.qkv_proj",
-        "attn_output": "self_attn.o_proj",
-        "output.weight": "lm_head.weight",
-        "output_norm": "model.norm",
-    },
-    "bloom": {
-        "token_embd.weight": "transformer.word_embeddings.weight",
-        "token_embd_norm": "transformer.word_embeddings_layernorm",
-        "blk": "transformer.h",
-        "ffn_up": "mlp.dense_h_to_4h",
-        "ffn_down": "mlp.dense_4h_to_h",
-        "ffn_norm": "post_attention_layernorm",
-        "attn_norm": "input_layernorm",
-        "attn_qkv": "self_attention.query_key_value",
-        "attn_output": "self_attention.dense",
-        "output.weight": "lm_head.weight",
-        "output_norm": "transformer.ln_f",
-    },
-    "falcon7b": {
-        "token_embd": "word_embeddings",
-        "blk": "h",
-        "ffn_up": "mlp.dense_h_to_4h",
-        "ffn_down": "mlp.dense_4h_to_h",
-        "attn_norm": "input_layernorm",
-        "attn_qkv": "self_attention.query_key_value",
-        "attn_output": "self_attention.dense",
-        ".output.": ".lm_head.",
-        "output_norm": "ln_f",
-    },
-    "falcon40b": {
-        "token_embd": "word_embeddings",
-        "blk": "h",
-        "ffn_up": "mlp.dense_h_to_4h",
-        "ffn_down": "mlp.dense_4h_to_h",
-        ".attn_norm.": ".ln_mlp.",
-        "attn_norm_2": "ln_attn",
-        "attn_qkv": "self_attention.query_key_value",
-        "attn_output": "self_attention.dense",
-        ".output.": ".lm_head.",
-        "output_norm": "ln_f",
-    },
-    "t5": {
-        "token_embd": "shared",
-        "dec.blk.{bid}.attn_q": "decoder.block.{bid}.layer.0.SelfAttention.q",
-        "dec.blk.{bid}.attn_k": "decoder.block.{bid}.layer.0.SelfAttention.k",
-        "dec.blk.{bid}.attn_v": "decoder.block.{bid}.layer.0.SelfAttention.v",
-        "dec.blk.{bid}.attn_o": "decoder.block.{bid}.layer.0.SelfAttention.o",
-        "dec.blk.{bid}.attn_rel_b": "decoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias",
-        "dec.blk.{bid}.attn_norm": "decoder.block.{bid}.layer.0.layer_norm",
-        "dec.blk.{bid}.cross_attn_q": "decoder.block.{bid}.layer.1.EncDecAttention.q",
-        "dec.blk.{bid}.cross_attn_k": "decoder.block.{bid}.layer.1.EncDecAttention.k",
-        "dec.blk.{bid}.cross_attn_v": "decoder.block.{bid}.layer.1.EncDecAttention.v",
-        "dec.blk.{bid}.cross_attn_o": "decoder.block.{bid}.layer.1.EncDecAttention.o",
-        "dec.blk.{bid}.cross_attn_norm": "decoder.block.{bid}.layer.1.layer_norm",
-        "dec.blk.{bid}.ffn_gate": "decoder.block.{bid}.layer.2.DenseReluDense.wi_0",
-        "dec.blk.{bid}.ffn_up": "decoder.block.{bid}.layer.2.DenseReluDense.wi_1",
-        "dec.blk.{bid}.ffn_down": "decoder.block.{bid}.layer.2.DenseReluDense.wo",
-        "dec.blk.{bid}.ffn_norm": "decoder.block.{bid}.layer.2.layer_norm",
-        "dec.output_norm": "decoder.final_layer_norm",
-        "enc.blk.{bid}.attn_q": "encoder.block.{bid}.layer.0.SelfAttention.q",
-        "enc.blk.{bid}.attn_k": "encoder.block.{bid}.layer.0.SelfAttention.k",
-        "enc.blk.{bid}.attn_v": "encoder.block.{bid}.layer.0.SelfAttention.v",
-        "enc.blk.{bid}.attn_o": "encoder.block.{bid}.layer.0.SelfAttention.o",
-        "enc.blk.{bid}.attn_rel_b": "encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias",
-        "enc.blk.{bid}.attn_norm": "encoder.block.{bid}.layer.0.layer_norm",
-        "enc.blk.{bid}.ffn_gate": "encoder.block.{bid}.layer.1.DenseReluDense.wi_0",
-        "enc.blk.{bid}.ffn_up": "encoder.block.{bid}.layer.1.DenseReluDense.wi_1",
-        "enc.blk.{bid}.ffn_down": "encoder.block.{bid}.layer.1.DenseReluDense.wo",
-        "enc.blk.{bid}.ffn_norm": "encoder.block.{bid}.layer.1.layer_norm",
-        "enc.output_norm": "encoder.final_layer_norm",
-        "output.weight": "lm_head.weight",
-    },
-    "t5encoder": {
-        "token_embd": "shared",
-        "enc.blk.{bid}.attn_q": "encoder.block.{bid}.layer.0.SelfAttention.q",
-        "enc.blk.{bid}.attn_k": "encoder.block.{bid}.layer.0.SelfAttention.k",
-        "enc.blk.{bid}.attn_v": "encoder.block.{bid}.layer.0.SelfAttention.v",
-        "enc.blk.{bid}.attn_o": "encoder.block.{bid}.layer.0.SelfAttention.o",
-        "enc.blk.{bid}.attn_rel_b": "encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias",
-        "enc.blk.{bid}.attn_norm": "encoder.block.{bid}.layer.0.layer_norm",
-        "enc.blk.{bid}.ffn_gate": "encoder.block.{bid}.layer.1.DenseReluDense.wi_0",
-        "enc.blk.{bid}.ffn_up": "encoder.block.{bid}.layer.1.DenseReluDense.wi_1",
-        "enc.blk.{bid}.ffn_down": "encoder.block.{bid}.layer.1.DenseReluDense.wo",
-        "enc.blk.{bid}.ffn_norm": "encoder.block.{bid}.layer.1.layer_norm",
-        "enc.output_norm": "encoder.final_layer_norm",
-    },
-    "stablelm": {
-        "token_embd": "model.embed_tokens",
-        "blk": "model.layers",
-        "ffn_up": "mlp.up_proj",
-        "ffn_down": "mlp.down_proj",
-        "ffn_gate": "mlp.gate_proj",
-        "ffn_norm": "post_attention_layernorm",
-        "attn_norm": "input_layernorm",
-        "attn_q": "self_attn.q_proj",
-        "attn_v": "self_attn.v_proj",
-        "attn_k": "self_attn.k_proj",
-        "attn_output": "self_attn.o_proj",
-        "output.weight": "lm_head.weight",
-        "output_norm": "model.norm",
-    },
-    "gpt2": {
-        "token_embd": "transformer.wte",
-        "blk": "transformer.h",
-        "position_embd": "transformer.wpe",
-        "output_norm": "transformer.ln_f",
-        "attn_norm": "ln_1",
-        "attn_qkv": "attn.c_attn",
-        "attn_output.weight": "attn.c_proj.weight",
-        "attn_output.bias": "attn.c_proj.bias",
-        "ffn_norm": "ln_2",
-        "ffn_up": "mlp.c_fc",
-        "ffn_down": "mlp.c_proj",
-    },
-    "starcoder2": {
-        "token_embd": "model.embed_tokens",
-        "blk": "model.layers",
-        "ffn_up": "mlp.c_fc",
-        "ffn_down": "mlp.c_proj",
-        "ffn_norm": "post_attention_layernorm",
-        "attn_norm": "input_layernorm",
-        "attn_q": "self_attn.q_proj",
-        "attn_v": "self_attn.v_proj",
-        "attn_k": "self_attn.k_proj",
-        "attn_output": "self_attn.o_proj",
-        "output.weight": "lm_head.weight",
-        "output_norm": "model.norm",
-    },
-    "mamba": {
-        "token_embd": "backbone.embeddings",
-        "blk": "backbone.layers",
-        "ssm_a": "mixer.A_log",
-        "ssm_conv1d": "mixer.conv1d",
-        "ssm_in": "mixer.in_proj",
-        "ssm_out": "mixer.out_proj",
-        "ssm_x": "mixer.x_proj",
-        "ssm_dt": "mixer.dt_proj",
-        "attn_norm": "norm",
-        "output_norm": "backbone.norm_f",
-        "output.weight": "lm_head.weight",
-    },
-    "nemotron": {
-        "token_embd": "model.embed_tokens",
-        "blk": "model.layers",
-        "ffn_up": "mlp.up_proj",
-        "ffn_down": "mlp.down_proj",
-        "ffn_norm": "post_attention_layernorm",
-        "attn_norm": "input_layernorm",
-        "attn_q": "self_attn.q_proj",
-        "attn_v": "self_attn.v_proj",
-        "attn_k": "self_attn.k_proj",
-        "attn_output": "self_attn.o_proj",
-        "output.weight": "lm_head.weight",
-        "output_norm": "model.norm",
-    },
-    "gemma2": {
-        "token_embd": "model.embed_tokens",
-        "blk": "model.layers",
-        "ffn_up": "mlp.up_proj",
-        "ffn_down": "mlp.down_proj",
-        "ffn_gate": "mlp.gate_proj",
-        "ffn_norm": "pre_feedforward_layernorm",
-        "post_attention_norm": "post_attention_layernorm",
-        "post_ffw_norm": "post_feedforward_layernorm",
-        "attn_norm": "input_layernorm",
-        "attn_q": "self_attn.q_proj",
-        "attn_v": "self_attn.v_proj",
-        "attn_k": "self_attn.k_proj",
-        "attn_output": "self_attn.o_proj",
-        "output_norm": "model.norm",
-    },
-}
-
-
 GGUF_CONFIG_MAPPING = {
    "general": {
        "architecture": "model_type",
--- a/src/transformers/modeling_gguf_pytorch_utils.py
+++ b/src/transformers/modeling_gguf_pytorch_utils.py
@@ -22,7 +22,6 @@ from tqdm import tqdm

 from .integrations import (
    GGUF_CONFIG_MAPPING,
-    GGUF_TENSOR_MAPPING,
    GGUF_TOKENIZER_MAPPING,
    _gguf_parse_value,
 )
@@ -47,12 +46,11 @@ GGUF_TO_TRANSFORMERS_MAPPING = {
        "general": {"file_type": "file_type", "quantization_version": "quantization_version"},
    },
    "config": GGUF_CONFIG_MAPPING,
-    "tensors": GGUF_TENSOR_MAPPING,
    "tokenizer": {"tokenizer": GGUF_TOKENIZER_MAPPING["tokenizer"]},
    "tokenizer_config": {"tokenizer": GGUF_TOKENIZER_MAPPING["tokenizer_config"]},
 }

-GGUF_SUPPORTED_ARCHITECTURES = list(GGUF_TO_TRANSFORMERS_MAPPING["tensors"].keys())
+GGUF_SUPPORTED_ARCHITECTURES = list(GGUF_TO_TRANSFORMERS_MAPPING["config"].keys())


 class GGUFTensor(NamedTuple):
@@ -121,21 +119,10 @@ class Qwen2MoeTensorProcessor(TensorProcessor):
    ):
        # Original merge implementation
        # https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L1994-L2022
-        exp_name = ""
-        if "ffn_gate_exps" in name:
-            exp_name = "gate_proj"
-        elif "ffn_down_exps" in name:
-            exp_name = "down_proj"
-        elif "ffn_up_exps" in name:
-            exp_name = "up_proj"
-        else:
-            raise ValueError(f"Cannot map expert tensor {name} in Qwen2Moe architecture.")
-        for tensor_name in tensor_key_mapping:
-            if tensor_name in name:
-                name = name.replace(tensor_name, tensor_key_mapping[tensor_name])
+        name = tensor_key_mapping[name]
        w_counter = self.config.get("num_experts", 60)
        for i in range(0, w_counter):
-            temp_name = name.replace(".weight", f".{i}.{exp_name}.weight")
+            temp_name = name.replace("mlp.experts.", f"mlp.experts.{i}.")
            exp_weight = weights[i]
            parsed_parameters["tensors"][temp_name] = torch.from_numpy(np.copy(exp_weight))

@@ -223,10 +210,6 @@ class MambaTensorProcessor(TensorProcessor):
        super().__init__(config=config)

    def process(self, weights, name, **kwargs):
-        if "ssm_d" in name and "bias" not in name and "weight" not in name:
-            # ssm_d has conflicts with ssm_dt in name checking
-            # we have to explicitly check that name is exactly ssm_d
-            name = name.replace("ssm_d", "mixer.D")
        if "ssm_conv1d.weight" in name:
            # for compatibility tensor ssm_conv1d must be (5120, 1, 4]) dim,
            # quantized one is (5120, 4)
@@ -267,7 +250,84 @@ def read_field(reader, field):
    return [_gguf_parse_value(value.parts[_data_index], value.types) for _data_index in value.data]


-def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False):
+# modified from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/model_loader/loader.py#L1115-L1147
+def get_gguf_hf_weights_map(
+    hf_model,
+    model_type: Optional[str] = None,
+    num_layers: Optional[int] = None,
+    qual_name: str = "",
+):
+    """
+    Build a mapping from GGUF tensor names to the parameter names of `hf_model`.
+
+    GGUF uses this naming convention for their tensors from HF checkpoint:
+    `blk.N.BB.weight` and `blk.N.BB.bias`
+    where N signifies the block number of a layer, and BB signifies the
+    attention/mlp layer components.
+    See "Standardized tensor names" in
+    https://github.com/ggerganov/ggml/blob/master/docs/gguf.md for details.
+
+    Args:
+        hf_model:
+            Model (or submodule, when recursing) whose `state_dict()` keys are translated. Its `config` is only
+            read when `model_type` or `num_layers` is not provided.
+        model_type (`str`, *optional*):
+            Architecture name looked up in gguf-py's `MODEL_ARCH_NAMES`. Defaults to `hf_model.config.model_type`.
+        num_layers (`int`, *optional*):
+            Number of blocks passed to `get_tensor_name_map`. Defaults to `hf_model.config.num_hidden_layers`.
+        qual_name (`str`, *optional*, defaults to `""`):
+            Prefix prepended to every HF parameter name in the result; used while recursing into submodules.
+
+    Returns:
+        `dict`: GGUF tensor name (including a `.weight`/`.bias` suffix when the HF name has one) mapped to the
+        `qual_name`-prefixed HF parameter name. HF names without a GGUF counterpart are left out.
+
+    Raises:
+        ImportError: If `is_gguf_available()` or `is_torch_available()` reports a missing dependency.
+        NotImplementedError: If `model_type` is not a value of `MODEL_ARCH_NAMES`.
+    """
+    if not (is_gguf_available() and is_torch_available()):
+        logger.error(
+            "Loading a GGUF checkpoint in PyTorch, requires both PyTorch and GGUF>=0.10.0 to be installed. Please see "
+            "https://pytorch.org/ and https://github.com/ggerganov/llama.cpp/tree/master/gguf-py for installation instructions."
+        )
+        raise ImportError("Please install torch and gguf>=0.10.0 to load a GGUF checkpoint in PyTorch.")
+
+    from gguf import MODEL_ARCH_NAMES, get_tensor_name_map
+
+    model_type = hf_model.config.model_type if model_type is None else model_type
+    num_layers = hf_model.config.num_hidden_layers if num_layers is None else num_layers
+    # hack: ggufs have a different name for cohere
+    if model_type == "cohere":
+        model_type = "command-r"
+    if model_type == "qwen2_moe":
+        model_type = "qwen2moe"
+
+    # MODEL_ARCH_NAMES maps arch key -> arch name, so we need a reverse lookup by name
+    arch = next((key for key, value in MODEL_ARCH_NAMES.items() if value == model_type), None)
+    if arch is None:
+        raise NotImplementedError(
+            f"Unknown gguf model_type: {model_type} in gguf-py. "
+            "This might because you're using an outdated version of gguf-py package, "
+            "you can install `gguf` package from source refer to "
+            "https://github.com/ggerganov/llama.cpp/tree/master/gguf-py#development"
+        )
+    name_map = get_tensor_name_map(arch, num_layers)
+
+    # Use a dummy conversion to get the mapping, because
+    # hf => gguf and gguf => hf mappings are reversed
+    gguf_to_hf_name_map = {}
+    for hf_name in hf_model.state_dict():
+        # An exception for qwen2moe model, where the expert layers are packed:
+        # drop the per-expert index so every expert resolves to the same packed name.
+        # The dots are escaped so that they only match literal separators.
+        if model_type == "qwen2moe" and "mlp.experts." in hf_name:
+            hf_name = re.sub(r"mlp\.experts\.\d+\.", "mlp.experts.", hf_name)
+
+        # The name map is queried without the trailing `.weight`/`.bias`,
+        # which is re-attached to the GGUF name afterwards
+        name, suffix = hf_name, ""
+        if hf_name.endswith((".weight", ".bias")):
+            name, suffix = hf_name.rsplit(".", 1)
+            suffix = "." + suffix
+
+        gguf_name = name_map.get_name(name)
+        if gguf_name is None:
+            continue
+
+        gguf_to_hf_name_map[gguf_name + suffix] = qual_name + hf_name
+
+    # Some model like Bloom converted from BloomModel instead of BloomForCausalLM
+    # Therefore, we need to check submodule as well to get a correct mapping
+    for child_name, child in hf_model.named_children():
+        sub_map = get_gguf_hf_weights_map(child, model_type, num_layers, qual_name=f"{qual_name}{child_name}.")
+        # Ignore the keys that are already in the main map to avoid overwriting
+        for gguf_key, hf_key in sub_map.items():
+            gguf_to_hf_name_map.setdefault(gguf_key, hf_key)
+
+    return gguf_to_hf_name_map
+
+
+def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False, model_to_load=None):
    """
    Load a GGUF file and return a dictionary of parsed parameters containing tensors, the parsed
    tokenizer and config attributes.
@@ -323,20 +383,8 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False):
        parsed_parameters["config"]["use_qkv_bias"] = qkv_bias
        parsed_parameters["config"]["use_parallel_residual"] = not use_parallel_residual

-    model_size = ""
-    # extract the number of params from file name as architectures can differ ;
-    # eg. for falcon : `...falcon-7b-...`
-    if "falcon" in architecture:
-        gguf_file_name = gguf_checkpoint_path.split("/")[-1].lower()
-        m = re.search(r"-\d+b-", gguf_file_name)  # regex to catch `-7b-`
-        if m is None:
-            raise ValueError(
-                f"From file name, cannot determine the number of parameters for {architecture} architecture"
-            )
-        model_size = m.group().strip("-")  # only keeps `7b`
-
-    if architecture + model_size not in GGUF_SUPPORTED_ARCHITECTURES:
-        raise ValueError(f"Architecture {architecture + model_size} not supported")
+    if architecture not in GGUF_SUPPORTED_ARCHITECTURES:
+        raise ValueError(f"GGUF model with architecture {architecture} is not supported yet.")

    # Handle tie_word_embeddings, if lm_head.weight is not present in tensors,
    # tie_word_embeddings is true otherwise false
@@ -388,7 +436,9 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False):
            )

    if return_tensors:
-        tensor_key_mapping = GGUF_TO_TRANSFORMERS_MAPPING["tensors"][architecture + model_size]
+        parsed_parameters["tensors"] = {}
+
+        tensor_key_mapping = get_gguf_hf_weights_map(model_to_load)
        config = parsed_parameters.get("config", {})

        ProcessorClass = TENSOR_PROCESSORS.get(architecture, TensorProcessor)
@@ -407,16 +457,12 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False):

            weights = result.weights
            name = result.name
-            bid = result.metadata.get("bid")

-            if name is None:
+            if name not in tensor_key_mapping:
                continue

-            for tensor_name in tensor_key_mapping:
-                if tensor_name.format(bid=bid) in name:
-                    name = name.replace(tensor_name.format(bid=bid), tensor_key_mapping[tensor_name].format(bid=bid))
+            name = tensor_key_mapping[name]

-            # Use copy to avoid errors with numpy and pytorch
            parsed_parameters["tensors"][name] = torch.from_numpy(np.copy(weights))

    if len(reader_keys) > 0:
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -3917,7 +3917,10 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix

                gguf_path = cached_file(pretrained_model_name_or_path, gguf_file, **cached_file_kwargs)

-            state_dict = load_gguf_checkpoint(gguf_path, return_tensors=True)["tensors"]
+            # we need a dummy model to help rename state_dict
+            with torch.device("meta"):
+                dummy_model = cls(config)
+            state_dict = load_gguf_checkpoint(gguf_path, return_tensors=True, model_to_load=dummy_model)["tensors"]

            resolved_archive_file = None
            is_sharded = False