mirror of
https://github.com/saymrwulf/transformers.git
synced 2026-05-14 20:58:08 +00:00
[GGUF] Refactor and decouple gguf checkpoint loading logic (#34385)
* draft load_gguf refactor * update Signed-off-by: Isotr0py <2037008807@qq.com> * remove llama mapping Signed-off-by: Isotr0py <2037008807@qq.com> * remove qwen2 mapping Signed-off-by: Isotr0py <2037008807@qq.com> * remove unused function Signed-off-by: Isotr0py <2037008807@qq.com> * deprecate stablelm mapping Signed-off-by: Isotr0py <2037008807@qq.com> * deprecate phi3 mapping Signed-off-by: Isotr0py <2037008807@qq.com> * deprecate t5 mapping Signed-off-by: Isotr0py <2037008807@qq.com> * deprecate bloom mapping Signed-off-by: Isotr0py <2037008807@qq.com> * fix bloom Signed-off-by: Isotr0py <2037008807@qq.com> * deprecate starcoder2 mapping Signed-off-by: Isotr0py <2037008807@qq.com> * deprecate gpt2 mapping Signed-off-by: Isotr0py <2037008807@qq.com> * deprecate mistral mapping Signed-off-by: Isotr0py <2037008807@qq.com> * deprecate nemotron mapping Signed-off-by: Isotr0py <2037008807@qq.com> * deprecate mamba mapping Signed-off-by: Isotr0py <2037008807@qq.com> * deprecate mamba mapping Signed-off-by: Isotr0py <2037008807@qq.com> * code format Signed-off-by: Isotr0py <2037008807@qq.com> * code format Signed-off-by: Isotr0py <2037008807@qq.com> * fix mamba Signed-off-by: Isotr0py <2037008807@qq.com> * fix qwen2moe Signed-off-by: Isotr0py <2037008807@qq.com> * remove qwen2moe mapping Signed-off-by: Isotr0py <2037008807@qq.com> * clean up Signed-off-by: Isotr0py <2037008807@qq.com> * remove falcon 7b map Signed-off-by: Isotr0py <2037008807@qq.com> * remove all ggml tensors mapping Signed-off-by: Isotr0py <2037008807@qq.com> * add comments Signed-off-by: Isotr0py <2037008807@qq.com> * update messages Signed-off-by: Isotr0py <2037008807@qq.com> * fix tensors in parsed parameters Signed-off-by: Isotr0py <2037008807@qq.com> * add gguf check Signed-off-by: Isotr0py <2037008807@qq.com> --------- Signed-off-by: Isotr0py <2037008807@qq.com>
This commit is contained in:
parent
86fa3cedad
commit
3951da1a6b
4 changed files with 92 additions and 293 deletions
|
|
@ -57,7 +57,6 @@ _import_structure = {
|
|||
"fsdp": ["is_fsdp_managed_module"],
|
||||
"ggml": [
|
||||
"GGUF_CONFIG_MAPPING",
|
||||
"GGUF_TENSOR_MAPPING",
|
||||
"GGUF_TOKENIZER_MAPPING",
|
||||
"_gguf_parse_value",
|
||||
"load_dequant_gguf_tensor",
|
||||
|
|
@ -161,7 +160,6 @@ if TYPE_CHECKING:
|
|||
from .fsdp import is_fsdp_managed_module
|
||||
from .ggml import (
|
||||
GGUF_CONFIG_MAPPING,
|
||||
GGUF_TENSOR_MAPPING,
|
||||
GGUF_TOKENIZER_MAPPING,
|
||||
_gguf_parse_value,
|
||||
load_dequant_gguf_tensor,
|
||||
|
|
|
|||
|
|
@ -33,254 +33,6 @@ from ..utils.logging import tqdm
|
|||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
GGUF_TENSOR_MAPPING = {
|
||||
"llama": {
|
||||
"token_embd": "model.embed_tokens",
|
||||
"blk": "model.layers",
|
||||
"ffn_up": "mlp.up_proj",
|
||||
"ffn_down": "mlp.down_proj",
|
||||
"ffn_gate": "mlp.gate_proj",
|
||||
"ffn_norm": "post_attention_layernorm",
|
||||
"attn_norm": "input_layernorm",
|
||||
"attn_q": "self_attn.q_proj",
|
||||
"attn_v": "self_attn.v_proj",
|
||||
"attn_k": "self_attn.k_proj",
|
||||
"attn_output": "self_attn.o_proj",
|
||||
"output.weight": "lm_head.weight",
|
||||
"output_norm": "model.norm",
|
||||
},
|
||||
"mistral": {
|
||||
"token_embd": "model.embed_tokens",
|
||||
"blk": "model.layers",
|
||||
"ffn_up": "mlp.up_proj",
|
||||
"ffn_down": "mlp.down_proj",
|
||||
"ffn_gate": "mlp.gate_proj",
|
||||
"ffn_norm": "post_attention_layernorm",
|
||||
"attn_norm": "input_layernorm",
|
||||
"attn_q": "self_attn.q_proj",
|
||||
"attn_v": "self_attn.v_proj",
|
||||
"attn_k": "self_attn.k_proj",
|
||||
"attn_output": "self_attn.o_proj",
|
||||
"output.weight": "lm_head.weight",
|
||||
"output_norm": "model.norm",
|
||||
},
|
||||
"qwen2": {
|
||||
"token_embd": "model.embed_tokens",
|
||||
"blk": "model.layers",
|
||||
"ffn_up": "mlp.up_proj",
|
||||
"ffn_down": "mlp.down_proj",
|
||||
"ffn_gate": "mlp.gate_proj",
|
||||
"ffn_norm": "post_attention_layernorm",
|
||||
"attn_norm": "input_layernorm",
|
||||
"attn_q": "self_attn.q_proj",
|
||||
"attn_v": "self_attn.v_proj",
|
||||
"attn_k": "self_attn.k_proj",
|
||||
"attn_output": "self_attn.o_proj",
|
||||
"output.weight": "lm_head.weight",
|
||||
"output_norm": "model.norm",
|
||||
},
|
||||
"qwen2moe": {
|
||||
"token_embd": "model.embed_tokens",
|
||||
"blk": "model.layers",
|
||||
"ffn_up_exps": "mlp.experts",
|
||||
"ffn_up_shexp": "mlp.shared_expert.up_proj",
|
||||
"ffn_down_exps": "mlp.experts",
|
||||
"ffn_down_shexp": "mlp.shared_expert.down_proj",
|
||||
"ffn_norm": "post_attention_layernorm",
|
||||
"ffn_gate_inp.weight": "mlp.gate.weight",
|
||||
"ffn_gate_exps": "mlp.experts",
|
||||
"ffn_gate_shexp": "mlp.shared_expert.gate_proj",
|
||||
"ffn_gate_inp_shexp": "mlp.shared_expert_gate",
|
||||
"attn_norm": "input_layernorm",
|
||||
"attn_q": "self_attn.q_proj",
|
||||
"attn_v": "self_attn.v_proj",
|
||||
"attn_k": "self_attn.k_proj",
|
||||
"attn_output": "self_attn.o_proj",
|
||||
"output.weight": "lm_head.weight",
|
||||
"output_norm": "model.norm",
|
||||
},
|
||||
"phi3": {
|
||||
"token_embd": "model.embed_tokens",
|
||||
"blk": "model.layers",
|
||||
"ffn_up": "mlp.gate_up_proj",
|
||||
"ffn_down": "mlp.down_proj",
|
||||
"ffn_gate": "mlp.gate_up_proj",
|
||||
"ffn_norm": "post_attention_layernorm",
|
||||
"attn_norm": "input_layernorm",
|
||||
"attn_qkv": "self_attn.qkv_proj",
|
||||
"attn_output": "self_attn.o_proj",
|
||||
"output.weight": "lm_head.weight",
|
||||
"output_norm": "model.norm",
|
||||
},
|
||||
"bloom": {
|
||||
"token_embd.weight": "transformer.word_embeddings.weight",
|
||||
"token_embd_norm": "transformer.word_embeddings_layernorm",
|
||||
"blk": "transformer.h",
|
||||
"ffn_up": "mlp.dense_h_to_4h",
|
||||
"ffn_down": "mlp.dense_4h_to_h",
|
||||
"ffn_norm": "post_attention_layernorm",
|
||||
"attn_norm": "input_layernorm",
|
||||
"attn_qkv": "self_attention.query_key_value",
|
||||
"attn_output": "self_attention.dense",
|
||||
"output.weight": "lm_head.weight",
|
||||
"output_norm": "transformer.ln_f",
|
||||
},
|
||||
"falcon7b": {
|
||||
"token_embd": "word_embeddings",
|
||||
"blk": "h",
|
||||
"ffn_up": "mlp.dense_h_to_4h",
|
||||
"ffn_down": "mlp.dense_4h_to_h",
|
||||
"attn_norm": "input_layernorm",
|
||||
"attn_qkv": "self_attention.query_key_value",
|
||||
"attn_output": "self_attention.dense",
|
||||
".output.": ".lm_head.",
|
||||
"output_norm": "ln_f",
|
||||
},
|
||||
"falcon40b": {
|
||||
"token_embd": "word_embeddings",
|
||||
"blk": "h",
|
||||
"ffn_up": "mlp.dense_h_to_4h",
|
||||
"ffn_down": "mlp.dense_4h_to_h",
|
||||
".attn_norm.": ".ln_mlp.",
|
||||
"attn_norm_2": "ln_attn",
|
||||
"attn_qkv": "self_attention.query_key_value",
|
||||
"attn_output": "self_attention.dense",
|
||||
".output.": ".lm_head.",
|
||||
"output_norm": "ln_f",
|
||||
},
|
||||
"t5": {
|
||||
"token_embd": "shared",
|
||||
"dec.blk.{bid}.attn_q": "decoder.block.{bid}.layer.0.SelfAttention.q",
|
||||
"dec.blk.{bid}.attn_k": "decoder.block.{bid}.layer.0.SelfAttention.k",
|
||||
"dec.blk.{bid}.attn_v": "decoder.block.{bid}.layer.0.SelfAttention.v",
|
||||
"dec.blk.{bid}.attn_o": "decoder.block.{bid}.layer.0.SelfAttention.o",
|
||||
"dec.blk.{bid}.attn_rel_b": "decoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias",
|
||||
"dec.blk.{bid}.attn_norm": "decoder.block.{bid}.layer.0.layer_norm",
|
||||
"dec.blk.{bid}.cross_attn_q": "decoder.block.{bid}.layer.1.EncDecAttention.q",
|
||||
"dec.blk.{bid}.cross_attn_k": "decoder.block.{bid}.layer.1.EncDecAttention.k",
|
||||
"dec.blk.{bid}.cross_attn_v": "decoder.block.{bid}.layer.1.EncDecAttention.v",
|
||||
"dec.blk.{bid}.cross_attn_o": "decoder.block.{bid}.layer.1.EncDecAttention.o",
|
||||
"dec.blk.{bid}.cross_attn_norm": "decoder.block.{bid}.layer.1.layer_norm",
|
||||
"dec.blk.{bid}.ffn_gate": "decoder.block.{bid}.layer.2.DenseReluDense.wi_0",
|
||||
"dec.blk.{bid}.ffn_up": "decoder.block.{bid}.layer.2.DenseReluDense.wi_1",
|
||||
"dec.blk.{bid}.ffn_down": "decoder.block.{bid}.layer.2.DenseReluDense.wo",
|
||||
"dec.blk.{bid}.ffn_norm": "decoder.block.{bid}.layer.2.layer_norm",
|
||||
"dec.output_norm": "decoder.final_layer_norm",
|
||||
"enc.blk.{bid}.attn_q": "encoder.block.{bid}.layer.0.SelfAttention.q",
|
||||
"enc.blk.{bid}.attn_k": "encoder.block.{bid}.layer.0.SelfAttention.k",
|
||||
"enc.blk.{bid}.attn_v": "encoder.block.{bid}.layer.0.SelfAttention.v",
|
||||
"enc.blk.{bid}.attn_o": "encoder.block.{bid}.layer.0.SelfAttention.o",
|
||||
"enc.blk.{bid}.attn_rel_b": "encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias",
|
||||
"enc.blk.{bid}.attn_norm": "encoder.block.{bid}.layer.0.layer_norm",
|
||||
"enc.blk.{bid}.ffn_gate": "encoder.block.{bid}.layer.1.DenseReluDense.wi_0",
|
||||
"enc.blk.{bid}.ffn_up": "encoder.block.{bid}.layer.1.DenseReluDense.wi_1",
|
||||
"enc.blk.{bid}.ffn_down": "encoder.block.{bid}.layer.1.DenseReluDense.wo",
|
||||
"enc.blk.{bid}.ffn_norm": "encoder.block.{bid}.layer.1.layer_norm",
|
||||
"enc.output_norm": "encoder.final_layer_norm",
|
||||
"output.weight": "lm_head.weight",
|
||||
},
|
||||
"t5encoder": {
|
||||
"token_embd": "shared",
|
||||
"enc.blk.{bid}.attn_q": "encoder.block.{bid}.layer.0.SelfAttention.q",
|
||||
"enc.blk.{bid}.attn_k": "encoder.block.{bid}.layer.0.SelfAttention.k",
|
||||
"enc.blk.{bid}.attn_v": "encoder.block.{bid}.layer.0.SelfAttention.v",
|
||||
"enc.blk.{bid}.attn_o": "encoder.block.{bid}.layer.0.SelfAttention.o",
|
||||
"enc.blk.{bid}.attn_rel_b": "encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias",
|
||||
"enc.blk.{bid}.attn_norm": "encoder.block.{bid}.layer.0.layer_norm",
|
||||
"enc.blk.{bid}.ffn_gate": "encoder.block.{bid}.layer.1.DenseReluDense.wi_0",
|
||||
"enc.blk.{bid}.ffn_up": "encoder.block.{bid}.layer.1.DenseReluDense.wi_1",
|
||||
"enc.blk.{bid}.ffn_down": "encoder.block.{bid}.layer.1.DenseReluDense.wo",
|
||||
"enc.blk.{bid}.ffn_norm": "encoder.block.{bid}.layer.1.layer_norm",
|
||||
"enc.output_norm": "encoder.final_layer_norm",
|
||||
},
|
||||
"stablelm": {
|
||||
"token_embd": "model.embed_tokens",
|
||||
"blk": "model.layers",
|
||||
"ffn_up": "mlp.up_proj",
|
||||
"ffn_down": "mlp.down_proj",
|
||||
"ffn_gate": "mlp.gate_proj",
|
||||
"ffn_norm": "post_attention_layernorm",
|
||||
"attn_norm": "input_layernorm",
|
||||
"attn_q": "self_attn.q_proj",
|
||||
"attn_v": "self_attn.v_proj",
|
||||
"attn_k": "self_attn.k_proj",
|
||||
"attn_output": "self_attn.o_proj",
|
||||
"output.weight": "lm_head.weight",
|
||||
"output_norm": "model.norm",
|
||||
},
|
||||
"gpt2": {
|
||||
"token_embd": "transformer.wte",
|
||||
"blk": "transformer.h",
|
||||
"position_embd": "transformer.wpe",
|
||||
"output_norm": "transformer.ln_f",
|
||||
"attn_norm": "ln_1",
|
||||
"attn_qkv": "attn.c_attn",
|
||||
"attn_output.weight": "attn.c_proj.weight",
|
||||
"attn_output.bias": "attn.c_proj.bias",
|
||||
"ffn_norm": "ln_2",
|
||||
"ffn_up": "mlp.c_fc",
|
||||
"ffn_down": "mlp.c_proj",
|
||||
},
|
||||
"starcoder2": {
|
||||
"token_embd": "model.embed_tokens",
|
||||
"blk": "model.layers",
|
||||
"ffn_up": "mlp.c_fc",
|
||||
"ffn_down": "mlp.c_proj",
|
||||
"ffn_norm": "post_attention_layernorm",
|
||||
"attn_norm": "input_layernorm",
|
||||
"attn_q": "self_attn.q_proj",
|
||||
"attn_v": "self_attn.v_proj",
|
||||
"attn_k": "self_attn.k_proj",
|
||||
"attn_output": "self_attn.o_proj",
|
||||
"output.weight": "lm_head.weight",
|
||||
"output_norm": "model.norm",
|
||||
},
|
||||
"mamba": {
|
||||
"token_embd": "backbone.embeddings",
|
||||
"blk": "backbone.layers",
|
||||
"ssm_a": "mixer.A_log",
|
||||
"ssm_conv1d": "mixer.conv1d",
|
||||
"ssm_in": "mixer.in_proj",
|
||||
"ssm_out": "mixer.out_proj",
|
||||
"ssm_x": "mixer.x_proj",
|
||||
"ssm_dt": "mixer.dt_proj",
|
||||
"attn_norm": "norm",
|
||||
"output_norm": "backbone.norm_f",
|
||||
"output.weight": "lm_head.weight",
|
||||
},
|
||||
"nemotron": {
|
||||
"token_embd": "model.embed_tokens",
|
||||
"blk": "model.layers",
|
||||
"ffn_up": "mlp.up_proj",
|
||||
"ffn_down": "mlp.down_proj",
|
||||
"ffn_norm": "post_attention_layernorm",
|
||||
"attn_norm": "input_layernorm",
|
||||
"attn_q": "self_attn.q_proj",
|
||||
"attn_v": "self_attn.v_proj",
|
||||
"attn_k": "self_attn.k_proj",
|
||||
"attn_output": "self_attn.o_proj",
|
||||
"output.weight": "lm_head.weight",
|
||||
"output_norm": "model.norm",
|
||||
},
|
||||
"gemma2": {
|
||||
"token_embd": "model.embed_tokens",
|
||||
"blk": "model.layers",
|
||||
"ffn_up": "mlp.up_proj",
|
||||
"ffn_down": "mlp.down_proj",
|
||||
"ffn_gate": "mlp.gate_proj",
|
||||
"ffn_norm": "pre_feedforward_layernorm",
|
||||
"post_attention_norm": "post_attention_layernorm",
|
||||
"post_ffw_norm": "post_feedforward_layernorm",
|
||||
"attn_norm": "input_layernorm",
|
||||
"attn_q": "self_attn.q_proj",
|
||||
"attn_v": "self_attn.v_proj",
|
||||
"attn_k": "self_attn.k_proj",
|
||||
"attn_output": "self_attn.o_proj",
|
||||
"output_norm": "model.norm",
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
GGUF_CONFIG_MAPPING = {
|
||||
"general": {
|
||||
"architecture": "model_type",
|
||||
|
|
|
|||
|
|
@ -22,7 +22,6 @@ from tqdm import tqdm
|
|||
|
||||
from .integrations import (
|
||||
GGUF_CONFIG_MAPPING,
|
||||
GGUF_TENSOR_MAPPING,
|
||||
GGUF_TOKENIZER_MAPPING,
|
||||
_gguf_parse_value,
|
||||
)
|
||||
|
|
@ -47,12 +46,11 @@ GGUF_TO_TRANSFORMERS_MAPPING = {
|
|||
"general": {"file_type": "file_type", "quantization_version": "quantization_version"},
|
||||
},
|
||||
"config": GGUF_CONFIG_MAPPING,
|
||||
"tensors": GGUF_TENSOR_MAPPING,
|
||||
"tokenizer": {"tokenizer": GGUF_TOKENIZER_MAPPING["tokenizer"]},
|
||||
"tokenizer_config": {"tokenizer": GGUF_TOKENIZER_MAPPING["tokenizer_config"]},
|
||||
}
|
||||
|
||||
GGUF_SUPPORTED_ARCHITECTURES = list(GGUF_TO_TRANSFORMERS_MAPPING["tensors"].keys())
|
||||
GGUF_SUPPORTED_ARCHITECTURES = list(GGUF_TO_TRANSFORMERS_MAPPING["config"].keys())
|
||||
|
||||
|
||||
class GGUFTensor(NamedTuple):
|
||||
|
|
@ -121,21 +119,10 @@ class Qwen2MoeTensorProcessor(TensorProcessor):
|
|||
):
|
||||
# Original merge implementation
|
||||
# https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L1994-L2022
|
||||
exp_name = ""
|
||||
if "ffn_gate_exps" in name:
|
||||
exp_name = "gate_proj"
|
||||
elif "ffn_down_exps" in name:
|
||||
exp_name = "down_proj"
|
||||
elif "ffn_up_exps" in name:
|
||||
exp_name = "up_proj"
|
||||
else:
|
||||
raise ValueError(f"Cannot map expert tensor {name} in Qwen2Moe architecture.")
|
||||
for tensor_name in tensor_key_mapping:
|
||||
if tensor_name in name:
|
||||
name = name.replace(tensor_name, tensor_key_mapping[tensor_name])
|
||||
name = tensor_key_mapping[name]
|
||||
w_counter = self.config.get("num_experts", 60)
|
||||
for i in range(0, w_counter):
|
||||
temp_name = name.replace(".weight", f".{i}.{exp_name}.weight")
|
||||
temp_name = name.replace("mlp.experts.", f"mlp.experts.{i}.")
|
||||
exp_weight = weights[i]
|
||||
parsed_parameters["tensors"][temp_name] = torch.from_numpy(np.copy(exp_weight))
|
||||
|
||||
|
|
@ -223,10 +210,6 @@ class MambaTensorProcessor(TensorProcessor):
|
|||
super().__init__(config=config)
|
||||
|
||||
def process(self, weights, name, **kwargs):
|
||||
if "ssm_d" in name and "bias" not in name and "weight" not in name:
|
||||
# ssm_d has conflicts with ssm_dt in name checking
|
||||
# we have to explicitly check that name is exactly ssm_d
|
||||
name = name.replace("ssm_d", "mixer.D")
|
||||
if "ssm_conv1d.weight" in name:
|
||||
# for compatibility tensor ssm_conv1d must be (5120, 1, 4) dim,
|
||||
# quantized one is (5120, 4)
|
||||
|
|
@ -267,7 +250,84 @@ def read_field(reader, field):
|
|||
return [_gguf_parse_value(value.parts[_data_index], value.types) for _data_index in value.data]
|
||||
|
||||
|
||||
def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False):
|
||||
# modified from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/model_loader/loader.py#L1115-L1147
|
||||
def get_gguf_hf_weights_map(
    hf_model,
    model_type: Optional[str] = None,
    num_layers: Optional[int] = None,
    qual_name: str = "",
):
    """
    Build a mapping from GGUF tensor names to the parameter names of `hf_model`.

    GGUF uses this naming convention for their tensors from HF checkpoint:
    `blk.N.BB.weight` and `blk.N.BB.bias`
    where N signifies the block number of a layer, and BB signifies the
    attention/mlp layer components.
    See "Standardized tensor names" in
    https://github.com/ggerganov/ggml/blob/master/docs/gguf.md for details.

    The mapping is obtained by translating every key of `hf_model.state_dict()`
    with gguf-py's HF => GGUF name map and recording the reverse direction. The
    function then recurses into the direct children of `hf_model` so that
    checkpoints converted from a sub-module are covered as well.

    Args:
        hf_model: Model (or sub-module, when recursing) whose `state_dict()` keys
            are translated. Its `config` is only read when `model_type` or
            `num_layers` is not given.
        model_type: Architecture name to look up in gguf-py. Defaults to
            `hf_model.config.model_type`.
        num_layers: Number of blocks passed to gguf-py's name map. Defaults to
            `hf_model.config.num_hidden_layers`.
        qual_name: Prefix prepended to every HF parameter name in the result;
            used by the recursion to keep names relative to the root model.

    Returns:
        `dict` mapping GGUF tensor names (with `.weight` / `.bias` suffix when
        present) to `qual_name`-prefixed HF parameter names.

    Raises:
        ImportError: If `gguf` or `torch` is not available.
        NotImplementedError: If `model_type` is unknown to the installed gguf-py.
    """
    if is_gguf_available() and is_torch_available():
        from gguf import MODEL_ARCH_NAMES, get_tensor_name_map
    else:
        logger.error(
            "Loading a GGUF checkpoint in PyTorch, requires both PyTorch and GGUF>=0.10.0 to be installed. Please see "
            "https://pytorch.org/ and https://github.com/ggerganov/llama.cpp/tree/master/gguf-py for installation instructions."
        )
        raise ImportError("Please install torch and gguf>=0.10.0 to load a GGUF checkpoint in PyTorch.")

    model_type = hf_model.config.model_type if model_type is None else model_type
    num_layers = hf_model.config.num_hidden_layers if num_layers is None else num_layers
    # hack: ggufs have a different name for cohere
    if model_type == "cohere":
        model_type = "command-r"
    if model_type == "qwen2_moe":
        model_type = "qwen2moe"

    # MODEL_ARCH_NAMES maps arch => name, so look the architecture up by value.
    arch = next((key for key, value in MODEL_ARCH_NAMES.items() if value == model_type), None)
    if arch is None:
        raise NotImplementedError(
            f"Unknown gguf model_type: {model_type} in gguf-py. "
            "This might because you're using an outdated version of gguf-py package, "
            "you can install `gguf` package from source refer to "
            "https://github.com/ggerganov/llama.cpp/tree/master/gguf-py#development"
        )
    name_map = get_tensor_name_map(arch, num_layers)

    # Use a dummy conversion to get the mapping, because
    # hf => gguf and gguf => hf mappings are reversed
    gguf_to_hf_name_map = {}
    for hf_name in hf_model.state_dict():
        # An exception for qwen2moe model, where the expert layers are packed:
        # drop the per-expert index so every expert resolves to the packed name.
        if model_type == "qwen2moe" and "mlp.experts." in hf_name:
            hf_name = re.sub(r"mlp\.experts\.\d+\.", "mlp.experts.", hf_name)

        # The name map is queried without the trailing `.weight` / `.bias`.
        stem, suffix = hf_name, ""
        if hf_name.endswith((".weight", ".bias")):
            stem, suffix = hf_name.rsplit(".", 1)
            suffix = "." + suffix

        gguf_name = name_map.get_name(stem)
        if gguf_name is None:
            continue

        gguf_to_hf_name_map[gguf_name + suffix] = qual_name + hf_name

    # Some models like Bloom are converted from BloomModel instead of BloomForCausalLM.
    # Therefore, we need to check submodules as well to get a correct mapping.
    for child_name, child in hf_model.named_children():
        sub_map = get_gguf_hf_weights_map(child, model_type, num_layers, qual_name=f"{qual_name}{child_name}.")
        # Keys already present in the main map take precedence and are never overwritten.
        for gguf_key, hf_key in sub_map.items():
            gguf_to_hf_name_map.setdefault(gguf_key, hf_key)

    return gguf_to_hf_name_map
|
||||
|
||||
|
||||
def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False, model_to_load=None):
|
||||
"""
|
||||
Load a GGUF file and return a dictionary of parsed parameters containing tensors, the parsed
|
||||
tokenizer and config attributes.
|
||||
|
|
@ -323,20 +383,8 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False):
|
|||
parsed_parameters["config"]["use_qkv_bias"] = qkv_bias
|
||||
parsed_parameters["config"]["use_parallel_residual"] = not use_parallel_residual
|
||||
|
||||
model_size = ""
|
||||
# extract the number of params from file name as architectures can differ ;
|
||||
# eg. for falcon : `...falcon-7b-...`
|
||||
if "falcon" in architecture:
|
||||
gguf_file_name = gguf_checkpoint_path.split("/")[-1].lower()
|
||||
m = re.search(r"-\d+b-", gguf_file_name) # regex to catch `-7b-`
|
||||
if m is None:
|
||||
raise ValueError(
|
||||
f"From file name, cannot determine the number of parameters for {architecture} architecture"
|
||||
)
|
||||
model_size = m.group().strip("-") # only keeps `7b`
|
||||
|
||||
if architecture + model_size not in GGUF_SUPPORTED_ARCHITECTURES:
|
||||
raise ValueError(f"Architecture {architecture + model_size} not supported")
|
||||
if architecture not in GGUF_SUPPORTED_ARCHITECTURES:
|
||||
raise ValueError(f"GGUF model with architecture {architecture} is not supported yet.")
|
||||
|
||||
# Handle tie_word_embeddings, if lm_head.weight is not present in tensors,
|
||||
# tie_word_embeddings is true otherwise false
|
||||
|
|
@ -388,7 +436,9 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False):
|
|||
)
|
||||
|
||||
if return_tensors:
|
||||
tensor_key_mapping = GGUF_TO_TRANSFORMERS_MAPPING["tensors"][architecture + model_size]
|
||||
parsed_parameters["tensors"] = {}
|
||||
|
||||
tensor_key_mapping = get_gguf_hf_weights_map(model_to_load)
|
||||
config = parsed_parameters.get("config", {})
|
||||
|
||||
ProcessorClass = TENSOR_PROCESSORS.get(architecture, TensorProcessor)
|
||||
|
|
@ -407,16 +457,12 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False):
|
|||
|
||||
weights = result.weights
|
||||
name = result.name
|
||||
bid = result.metadata.get("bid")
|
||||
|
||||
if name is None:
|
||||
if name not in tensor_key_mapping:
|
||||
continue
|
||||
|
||||
for tensor_name in tensor_key_mapping:
|
||||
if tensor_name.format(bid=bid) in name:
|
||||
name = name.replace(tensor_name.format(bid=bid), tensor_key_mapping[tensor_name].format(bid=bid))
|
||||
name = tensor_key_mapping[name]
|
||||
|
||||
# Use copy to avoid errors with numpy and pytorch
|
||||
parsed_parameters["tensors"][name] = torch.from_numpy(np.copy(weights))
|
||||
|
||||
if len(reader_keys) > 0:
|
||||
|
|
|
|||
|
|
@ -3917,7 +3917,10 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
|
|||
|
||||
gguf_path = cached_file(pretrained_model_name_or_path, gguf_file, **cached_file_kwargs)
|
||||
|
||||
state_dict = load_gguf_checkpoint(gguf_path, return_tensors=True)["tensors"]
|
||||
# we need a dummy model to help rename state_dict
|
||||
with torch.device("meta"):
|
||||
dummy_model = cls(config)
|
||||
state_dict = load_gguf_checkpoint(gguf_path, return_tensors=True, model_to_load=dummy_model)["tensors"]
|
||||
|
||||
resolved_archive_file = None
|
||||
is_sharded = False
|
||||
|
|
|
|||
Loading…
Reference in a new issue