Fix llama.covert_onnx to make it runnable in CI (#19372)

### Description 1. make parity_check use local model to avoid using hf token 2. del the model didn't work because it tried to del the object define out of the function scope. So it caused out of memory in A10. 3. In fact, 16G GPU memory (one T4) is enough. But the conversion process always be killed in T4 and it works on A10/24G. Standard_NC4as_T4_v3 has 28G CPU memory Standard_NV36ads_A10_v5 has 440G memory. It looks that the model conversion needs very huge memory. ### Motivation and Context Last time, I came across some issues in convert_to_onnx.py so I use the onnx model in https://github.com/microsoft/Llama-2-Onnx for testing. Now, these issues could be fixed. So I use onnx model generated by this repo and the CI can cover the model conversion.
2026-07-07 04:39:07 +00:00 · 2024-02-05 07:26:24 +08:00 · 2024-02-05 07:26:24 +08:00 · 435e19953e
commit 435e19953e
parent 0cba56e0a0
5 changed files with 84 additions and 56 deletions
--- a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py
+++ b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py
@ -781,6 +781,13 @@ def get_args():
        action="store_true",
        help="Avoid exporting model, only apply quantizations and optimizations to existing model exported from optimum.",
    )
+
+    parser.add_argument(
+        "--small_gpu",
+        action="store_true",
+        help="Load the llama in GPU every time for parity_check if it's running in a machine which GPU memory < 36GB.",
+    )
+
    parser.set_defaults(optimize_optimum=False)

    args = parser.parse_args()
@ -788,9 +795,7 @@ def get_args():


 def main():
-    if version.parse(torch.__version__) < version.parse("2.2.0") and "2.2.0.dev" not in torch.__version__:
-        # Second predicate is for comparing nightly (ex: 2.2.0.dev20230920 vs 2.2.0) since first predicate is false
-        # in that scenario. It can be removed when torch v2.2.0 is released in stable.
+    if version.parse(torch.__version__) < version.parse("2.2.0"):
        logger.error(f"Detected PyTorch version {torch.__version__}. Please upgrade and use v2.2.0 or newer.")
        return

@ -1021,7 +1026,11 @@ def main():
            args.precision,
            "--cache_dir",
            args.cache_dir,
+            "--torch_model_directory",
+            args.input,
        ]
+        if args.small_gpu:
+            parity_cmd.append("--small_gpu")
        if "with_past" in filename:
            parity_cmd.append("--use_past_kv")
        if "merged" in filename:
@ -1030,7 +1039,7 @@ def main():
            parity_cmd.append("--use_gqa")

        try:
-            logger.debug(f"check parity with cmd: {parity_cmd}")
+            logger.info(f"check parity with cmd: {parity_cmd}")
            parity_check(parity_cmd)
        except Exception as e:
            logger.warning(f"An error occurred while verifying parity: {e}", exc_info=True)
--- a/onnxruntime/python/tools/transformers/models/llama/llama_parity.py
+++ b/onnxruntime/python/tools/transformers/models/llama/llama_parity.py
@ -17,7 +17,7 @@ from llama_inputs import (
    get_sample_with_past_kv_inputs,
 )
 from llama_torch import setup_torch_model
-from transformers import AutoConfig, AutoModelForCausalLM
+from transformers import AutoConfig

 import onnxruntime as ort

@ -67,20 +67,39 @@ def get_inputs(args: argparse.Namespace, config: AutoConfig):


 def verify_parity(
-    args: argparse.Namespace, config: AutoConfig, pt_model: AutoModelForCausalLM, kv_cache_ortvalues: dict
+    args: argparse.Namespace,
+    location: str,
+    use_auth_token: bool,
+    kv_cache_ortvalues: dict,
+    pytorch_model: None | torch.nn.Module = None,
+    config: None | AutoConfig = None,
 ):
+    # If it's running in a machine which GPU memory < 36GB, it should unload the llama in GPU in time and free the GPU memory for ORT.
+    py_model = pytorch_model
+    if py_model is None:
+        config, py_model = setup_torch_model(
+            args,
+            location,
+            use_auth_token,
+            torch_dtype=(torch.float16 if args.use_fp16 else torch.float32),
+            device=args.device,
+        )
+
    inputs = get_inputs(args, config)

    # Run inference with PyTorch
    if args.execution_provider != "cpu":
        torch.cuda.synchronize()
    start_time = time.time()
-    pt_outputs = pt_model(**inputs).logits.detach().cpu().numpy()
+    pt_outputs = py_model(**inputs).logits.detach().cpu().numpy()
    if args.execution_provider != "cpu":
        torch.cuda.synchronize()
    end_time = time.time()
    logger.info(f"PyTorch took {end_time - start_time} s")
-    del pt_model
+
+    if args.small_gpu and py_model is not None:
+        del py_model
+        torch.cuda.empty_cache()

    # Run inference with ORT
    past_sequence_length, _, max_sequence_length = get_sequence_lengths(args)
@ -222,6 +241,13 @@ def get_args(argv: list[str]):
        help="model cache dir to override default HF cache dir to avoid overflood the /home dir",
    )

+    # The argument is used for CI mainly, because the CI machine has 24G GPU memory at most.
+    parser.add_argument(
+        "--small_gpu",
+        action="store_true",
+        help="Load the llama in GPU every time for parity_check if it's running in a machine which GPU memory < 36GB. ",
+    )
+
    args = parser.parse_args() if argv == [] else parser.parse_args(argv)

    # Use FP32 precision for FP32, INT8, INT4 CPU models, use FP16 precision for FP16 and INT4 GPU models
@ -247,25 +273,29 @@ def main(argv: list[str] = []):  # noqa: B006
    use_auth_token = args.torch_model_directory == os.path.join(".")
    location = args.model_name if use_auth_token else args.torch_model_directory

-    config, llama = setup_torch_model(
-        args,
-        location,
-        use_auth_token,
-        torch_dtype=(torch.float16 if args.use_fp16 else torch.float32),
-        device=args.device,
-    )
-
    kv_cache_ortvalues = {}
    if not args.merged:
-        verify_parity(args, config, llama, kv_cache_ortvalues)
+        verify_parity(args, location, use_auth_token, kv_cache_ortvalues)
    else:
-        # Verify prompt generation in merged model (decoder_model.onnx)
+        config = llama = None
+        if not args.small_gpu:
+            config, llama = setup_torch_model(
+                args,
+                location,
+                use_auth_token,
+                torch_dtype=(torch.float16 if args.use_fp16 else torch.float32),
+                device=args.device,
+            )
+
+        # Verify prompt processing in merged model (decoder_model.onnx)
        args.use_past_kv = False
-        kv_cache_ortvalues = verify_parity(args, config, llama, kv_cache_ortvalues)
+        kv_cache_ortvalues = verify_parity(
+            args, location, use_auth_token, kv_cache_ortvalues, pytorch_model=llama, config=config
+        )

        # Verify token generation in merged model (decoder_with_past_model.onnx)
        args.use_past_kv = True
-        verify_parity(args, config, llama, kv_cache_ortvalues)
+        verify_parity(args, location, use_auth_token, kv_cache_ortvalues, pytorch_model=llama, config=config)


 if __name__ == "__main__":
--- a/onnxruntime/python/tools/transformers/models/llama/requirements-cuda.txt
+++ b/onnxruntime/python/tools/transformers/models/llama/requirements-cuda.txt
@ -1,4 +1,4 @@
 -r requirements.txt
-# Please manually install torch>=2.2.0.dev20230920 with CUDA enabled for the CUDA version installed in your system.
+# Please manually install torch>=2.2.0 with CUDA enabled for the CUDA version installed in your system.
 # Instructions can be found here: https://pytorch.org/get-started/locally/
-onnxruntime-gpu>=1.16.2
+onnxruntime-gpu>=1.16.2
--- a/onnxruntime/python/tools/transformers/models/llama/requirements.txt
+++ b/onnxruntime/python/tools/transformers/models/llama/requirements.txt
@ -1,6 +1,6 @@
 optimum>=1.14.1
 transformers>=4.33.2
-torch>=2.2.0.dev20230920
+torch>=2.2.0
 onnx>=1.14.0
 datasets>=2.8.0
-protobuf==3.20.2
+protobuf==3.20.2
--- a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
@ -268,7 +268,7 @@ stages:
      skipComponentGovernanceDetection: true
    workspace:
      clean: all
-    pool: onnxruntime-Linux-GPU-T4
+    pool: Onnxruntime-Linux-A10-24G
    steps:
    - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
      displayName: 'Clean Agent Directories'
@ -278,10 +278,6 @@ stages:
      clean: true
      submodules: none

-    - checkout: LLaMa2Onnx
-      clean: true
-      submodules: none
-
    - template: templates/flex-downloadPipelineArtifact.yml
      parameters:
        StepName: 'Download Onnxruntime Artifact'
@ -290,47 +286,40 @@ stages:
        SpecificArtifact: ${{ parameters.specificArtifact }}
        BuildId: ${{ parameters.BuildId }}

-    - task: DownloadPackage@1
-      displayName: 'Download Llama2 model'
-      inputs:
-        packageType: upack
-        feed: '/7424c8e4-5c62-490e-95c4-79446f31017c'
-        version: 1.0.0
-        definition: '772ebce3-7e06-46d5-b3cc-82040ec4b2ce'
-        downloadPath: $(Agent.TempDirectory)/llama2_onnx_ft16
-
    - template: templates/get-docker-image-steps.yml
      parameters:
-        Dockerfile: onnxruntime/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda11_8_tensorrt8_6
-        Context: onnxruntime/tools/ci_build/github/linux/docker/
-        ScriptName: onnxruntime/tools/ci_build/get_docker_image.py
+        Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda11_8_tensorrt8_6
+        Context: tools/ci_build/github/linux/docker/
+        ScriptName: tools/ci_build/get_docker_image.py
        DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )"
        Repository: onnxruntimeubi8packagestest
        UpdateDepsTxt: false

+    - task: DownloadPackage@1
+      displayName: 'Download Meta Llama2 model'
+      inputs:
+        packageType: upack
+        feed: '/7424c8e4-5c62-490e-95c4-79446f31017c'
+        version: 1.0.0
+        definition: '6fe0c4ed-9d0e-4d66-94cc-fb6a111d02a5'
+        downloadPath: $(Agent.TempDirectory)/meta_llama2_7b_hf
+
    - script: |
-        docker run --rm --gpus all -v $(Build.SourcesDirectory)/Llama-2-Onnx:/workspace \
+        docker run --rm --gpus all -v $(Build.SourcesDirectory):/workspace \
           -v $(Build.BinariesDirectory)/ort-artifact/:/ort-artifact \
-           -v $(Agent.TempDirectory)/llama2_onnx_ft16:/models \
+           -v $(Agent.TempDirectory)/meta_llama2_7b_hf:/meta-llama2 \
           onnxruntimeubi8packagestest \
            bash -c "
              set -ex; \
+              pushd /workspace/onnxruntime/python/tools/transformers/ ; \
              python3 -m pip install --upgrade pip ; \
+              pushd models/llama ; \
+              python3 -m pip install -r requirements-cuda.txt ; \
+              popd ; \
              python3 -m pip install /ort-artifact/*.whl ; \
              python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu118 ; \
-              python3 -m pip install sentencepiece ; \
-              pushd /workspace ; \
-              python3 MinimumExample/Example_ONNX_LlamaV2.py --onnx_file /models/ONNX/LlamaV2_7B_FT_float16.onnx \
-                --embedding_file /models/embeddings.pth --tokenizer_path tokenizer.model --prompt 'What is the lightest element?' > /workspace/answer.txt ; \
+              python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-fp16 --precision fp16 --execution_provider cuda --input /meta-llama2 --small_gpu ;\
              popd ; \
            "
-      displayName: 'Run Llama2 demo'
+      displayName: 'Run Llama2 to Onnx F16 and parity Test'
      workingDirectory: $(Build.SourcesDirectory)
-
-    - script: |
-        set -ex
-        real=$(cat $(Build.SourcesDirectory)/Llama-2-Onnx/answer.txt)
-        trim_actual=$(tr -dc '[[:print:]]' <<< "$real")
-        expected="The lightest element is hydrogen. Hydrogen is the lightest element on the periodic table, with an atomic mass of 1.00794 u (unified atomic mass units)."
-        [ "$expected" == "$trim_actual" ] && exit 0 || exit 1
-      displayName: 'Check result'