Add Whisper model in CI (#19604)

### Description

Add Whisper conversion and an end-to-end (E2E) test to the Big Models pipeline.

### Motivation and Context

---------

Co-authored-by: Your Name <your@email.com>
Co-authored-by: kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com>
2026-07-20 19:12:24 +00:00 · 2024-02-25 14:04:22 +08:00 · 2024-02-25 14:04:22 +08:00 · 0fcc6fb760
commit 0fcc6fb760
parent c980149c85
7 changed files with 115 additions and 8 deletions
--- a/onnxruntime/python/tools/transformers/benchmark_helper.py
+++ b/onnxruntime/python/tools/transformers/benchmark_helper.py
@ -589,7 +589,7 @@ def measure_memory(is_gpu, func, monitor_type="cuda", start_memory=None):
            if max_usage is None:
                return None

-            print(f"GPU memory usage: before={memory_before_test}  peak={max_usage}")
+            logger.info(f"GPU memory usage: before={memory_before_test}  peak={max_usage}")
            if len(memory_before_test) >= 1 and len(max_usage) >= 1 and len(memory_before_test) == len(max_usage):
                # When there are multiple GPUs, we will check the one with maximum usage.
                max_used = 0
@ -620,7 +620,7 @@ def measure_memory(is_gpu, func, monitor_type="cuda", start_memory=None):
            monitor.keep_measuring = False
            max_usage = mem_thread.result()

-        print(f"CPU memory usage: before={memory_before_test:.1f} MB, peak={max_usage:.1f} MB")
+        logger.info(f"CPU memory usage: before={memory_before_test:.1f} MB, peak={max_usage:.1f} MB")
        return max_usage - memory_before_test


--- a/onnxruntime/python/tools/transformers/models/whisper/benchmark.py
+++ b/onnxruntime/python/tools/transformers/models/whisper/benchmark.py
@ -410,7 +410,8 @@ def run_ort_inference(args, inputs, model):
        actual_output = handle_output(ort_outputs[0][0])
        logger.info(f"Generated token length: {len(actual_output)} tokens")
        transcription = args.processor.batch_decode(ort_outputs[0], skip_special_tokens=True)[0]
-        logger.info(f"Transcription: {transcription}")
+        # print to stdout as the output for comparison
+        print(f"{transcription}")

    measure_fn(args, generate_fn, ort_inputs)

--- a/onnxruntime/python/tools/transformers/models/whisper/requirements.txt
+++ b/onnxruntime/python/tools/transformers/models/whisper/requirements.txt
@ -8,4 +8,7 @@ librosa
 optimum
 onnxruntime-extensions>=0.9.0
 protobuf==3.20.2
-numpy==1.23.3
+numpy==1.23.3
+onnx>=1.15.0
+psutil
+py3nvml
--- a/onnxruntime/python/tools/transformers/models/whisper/test/1272-141231-0002.mp3
+++ b/onnxruntime/python/tools/transformers/models/whisper/test/1272-141231-0002.mp3
--- a/onnxruntime/python/tools/transformers/models/whisper/test/whisper_ort_output.txt
+++ b/onnxruntime/python/tools/transformers/models/whisper/test/whisper_ort_output.txt
@ -0,0 +1 @@
+ the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about
--- a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
@ -314,12 +314,111 @@ stages:
              pushd /workspace/onnxruntime/python/tools/transformers/ ; \
              python3 -m pip install --upgrade pip ; \
              pushd models/llama ; \
-              python3 -m pip install -r requirements-cuda.txt ; \
+              python3 -m pip install -r requirements.txt ; \
              popd ; \
              python3 -m pip install /ort-artifact/*.whl ; \
+              python3 -m pip uninstall -y torch ; \
              python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu118 ; \
              python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-fp16 --precision fp16 --execution_provider cuda --input /meta-llama2 --small_gpu ;\
              popd ; \
            "
      displayName: 'Run Llama2 to Onnx F16 and parity Test'
      workingDirectory: $(Build.SourcesDirectory)
+
+- stage: Whisper_ONNX
+  dependsOn:
+  - Build_Onnxruntime_Cuda
+  jobs:
+  - job: Whisper_ONNX
+    variables:
+      skipComponentGovernanceDetection: true
+    workspace:
+      clean: all
+    pool: Onnxruntime-Linux-A10-24G
+    steps:
+    - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
+      displayName: 'Clean Agent Directories'
+      condition: always()
+
+    - checkout: self
+      clean: true
+      submodules: none
+
+    - template: templates/flex-downloadPipelineArtifact.yml
+      parameters:
+        StepName: 'Download Onnxruntime Artifact'
+        ArtifactName: 'drop-ort-linux-gpu'
+        TargetPath: '$(Build.BinariesDirectory)/ort-artifact/'
+        SpecificArtifact: ${{ parameters.specificArtifact }}
+        BuildId: ${{ parameters.BuildId }}
+
+    - template: templates/get-docker-image-steps.yml
+      parameters:
+        Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu
+        Context: tools/ci_build/github/linux/docker/
+        ScriptName: tools/ci_build/get_docker_image.py
+        DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )"
+        Repository: onnxruntimepackagestest
+        UpdateDepsTxt: false
+
+    - task: DownloadPackage@1
+      # The model data in this artifact was downloaded from openai/whisper-large-v3 on the Hugging Face model hub.
+      # To reduce its size, the .git directory and pickled files were removed; only the safetensors model files are kept.
+      displayName: 'Download Whisper Model'
+      inputs:
+        packageType: upack
+        feed: '/7424c8e4-5c62-490e-95c4-79446f31017c'
+        version: 1.0.0
+        definition: 'b583ce7c-1a8f-4099-ae28-5d5f56c478b1'
+        downloadPath: $(Agent.TempDirectory)/whisper_large_v3
+
+    - script: |
+        docker run --rm --gpus all -v $(Build.SourcesDirectory):/workspace \
+           -v $(Build.BinariesDirectory)/ort-artifact/:/ort-artifact \
+           -v $(Agent.TempDirectory)/whisper_large_v3:/whisper_large_v3 \
+           onnxruntimepackagestest \
+            bash -c '
+              set -ex; \
+              pushd /workspace/onnxruntime/python/tools/transformers/ ; \
+              python3 -m pip install --upgrade pip ; \
+              pushd models/whisper ; \
+              python3 -m pip install -r requirements.txt ; \
+              popd ; \
+              python3 -m pip install /ort-artifact/*.whl ; \
+              python3 -m pip uninstall -y torch ; \
+              python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu118 ; \
+              python3 -m models.whisper.convert_to_onnx -m /whisper_large_v3 --output whisperlargev3 --use_external_data_format ; \
+              popd ; \
+            '
+      displayName: 'Convert Whisper Model'
+      workingDirectory: $(Build.SourcesDirectory)
+
+    - script: |
+        docker run --rm --gpus all -v $(Build.SourcesDirectory):/workspace \
+           -v $(Build.BinariesDirectory)/ort-artifact/:/ort-artifact \
+           -v $(Agent.TempDirectory)/whisper_large_v3:/whisper_large_v3 \
+           onnxruntimepackagestest \
+            bash -c '
+              set -ex; \
+              pushd /workspace/onnxruntime/python/tools/transformers/ ; \
+              python3 -m pip install --upgrade pip ; \
+              pushd models/whisper ; \
+              python3 -m pip install -r requirements.txt ; \
+              popd ; \
+              python3 -m pip install /ort-artifact/*.whl ; \
+              python3 -m pip uninstall -y torch ; \
+              python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu118 ; \
+              ls whisperlargev3; \
+              python3 -m models.whisper.benchmark \
+                  --benchmark-type ort \
+                  --audio-path models/whisper/test/1272-141231-0002.mp3 \
+                  --model-name openai/whisper-large-v3 \
+                  --ort-model-path /workspace/onnxruntime/python/tools/transformers/whisperlargev3/whisper_large_v3_beamsearch.onnx \
+                  --precision fp32 \
+                  --device cuda > ort_output.txt ; \
+              cat ort_output.txt ; \
+              diff ort_output.txt /workspace/onnxruntime/python/tools/transformers/models/whisper/test/whisper_ort_output.txt && exit 0 || exit 1
+              popd ; \
+            '
+      displayName: 'Test Whisper ONNX Model'
+      workingDirectory: $(Build.SourcesDirectory)
--- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu
+++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu
@ -16,15 +16,18 @@ ENV DEBIAN_FRONTEND=noninteractive
 ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH_ARG}:${LD_LIBRARY_PATH}

 RUN apt-get update &&\
-    apt-get install -y git bash wget
+    apt-get install -y git bash wget diffutils

 # Install python3
 RUN apt-get install -y --no-install-recommends \
    python3 \
    python3-pip \
    python3-dev \
-    python3-wheel 
-   
+    python3-wheel
+
+# Install ffmpeg, which couldn't be installed in UBI8
+# https://stackoverflow.com/questions/73597789/how-to-install-ffmpeg-on-ubi-docker-images
+RUN apt-get install -y --no-install-recommends ffmpeg

 RUN pip install --upgrade pip
				`@ -0,0 +1 @@`
				`the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about`