diff --git a/src/transformers/models/llama/quantize_fp8_llama.py b/src/transformers/models/llama/quantize_fp8_llama.py
new file mode 100644
index 000000000..3e319622d
--- /dev/null
+++ b/src/transformers/models/llama/quantize_fp8_llama.py
@@ -0,0 +1,36 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+from transformers import FbgemmFp8Config, LlamaForCausalLM
+
+
+modules_to_not_convert = []
+
+# As defined by Meta, we don't quantize the first and last decoder layers or the lm_head, and we also keep every layer's self_attn unquantized.
+modules_to_not_convert.append("model.layers.0")
+modules_to_not_convert.append("model.layers.125")
+modules_to_not_convert.append("lm_head")
+for layer_i in range(1, 125):
+    modules_to_not_convert.append(f"model.layers.{layer_i}.self_attn")
+
+quantization_config = FbgemmFp8Config(modules_to_not_convert=modules_to_not_convert)
+model_name = "meta-llama/Llama-3.1-405B"
+
+model = LlamaForCausalLM.from_pretrained(
+    model_name, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
+)
+
+model.save_pretrained(f"{model_name}-FP8")
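
For reference, a minimal sketch of how the checkpoint written by this script could be loaded back for inference. The local path simply mirrors the f"{model_name}-FP8" directory used above, and the prompt is illustrative; it assumes the fbgemm-gpu FP8 kernels are installed so the quantization config stored with the checkpoint can be honored.

from transformers import AutoModelForCausalLM, AutoTokenizer

# Hypothetical local directory produced by save_pretrained in the script above.
quantized_path = "meta-llama/Llama-3.1-405B-FP8"

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-405B")
# The quantization config saved alongside the weights is picked up automatically.
model = AutoModelForCausalLM.from_pretrained(quantized_path, device_map="auto")

inputs = tokenizer("The capital of France is", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=16)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))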