##### start trigger. Don't edit this block manually; please edit set-trigger-rules.py ####
### and rerun set-trigger-rules.py to regenerate it ###
trigger:
  branches:
    include:
    - main
    - rel-*
  paths:
    exclude:
    - docs/**
    - README.md
    - CONTRIBUTING.md
    - BUILD.md
    - 'js/web'
    - 'onnxruntime/core/providers/js'
pr:
  branches:
    include:
    - main
    - rel-*
  paths:
    exclude:
    - docs/**
    - README.md
    - CONTRIBUTING.md
    - BUILD.md
    - 'js/web'
    - 'onnxruntime/core/providers/js'
#### end trigger ####
# reference: https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md
parameters:
- name: specificArtifact
  displayName: Use Specific Artifact
  type: boolean
  default: false
- name: BuildId
  displayName: Specific Artifact's RunId
  type: number
  default: 0
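# The two parameters above let downstream stages reuse build artifacts from an
# earlier run: when specificArtifact is true, the flex-downloadPipelineArtifact
# template downloads from the run identified by BuildId instead of the current
# run. An illustrative queue-time invocation (assuming the Azure DevOps CLI
# with the azure-devops extension; the pipeline name is an example):
#   az pipelines run --name bigmodels-ci-pipeline --parameters specificArtifact=true BuildId=12345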
variables:
- name: docker_base_image
  value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250124.1
- name: linux_trt_version
  value: 10.3.0.26-1.cuda11.8
- name: Repository
  value: 'onnxruntimecuda11manylinuxbuild'
stages:
- stage: Build_Onnxruntime_Cuda
  jobs:
  - job: Linux_Build
    timeoutInMinutes: 120
    variables:
      skipComponentGovernanceDetection: true
      CCACHE_DIR: $(Pipeline.Workspace)/ccache
    workspace:
      clean: all
    pool: onnxruntime-Ubuntu2204-AMD-CPU
    steps:
    - checkout: self
      clean: true
      submodules: none
    # same as linux-gpu-ci-pipeline.yml
    - template: templates/get-docker-image-steps.yml
      parameters:
        Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
        Context: tools/ci_build/github/linux/docker
        DockerBuildArgs: "
          --network=host
          --build-arg BASEIMAGE=$(docker_base_image)
          --build-arg TRT_VERSION=$(linux_trt_version)
          --build-arg BUILD_UID=$( id -u )
          "
        Repository: $(Repository)
    - task: CmdLine@2
      inputs:
        script: |
          mkdir -p $HOME/.onnx
          docker run --rm \
            --volume /data/onnx:/data/onnx:ro \
            --volume $(Build.SourcesDirectory):/onnxruntime_src \
            --volume $(Build.BinariesDirectory):/build \
            --volume /data/models:/build/models:ro \
            --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \
            -e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \
            -e NIGHTLY_BUILD \
            -e BUILD_BUILDNUMBER \
            $(Repository) \
            /bin/bash -c '
              set -ex; \
              PATH=/opt/python/cp310-cp310/bin:$PATH /opt/python/cp310-cp310/bin/python3 /onnxruntime_src/tools/ci_build/build.py \
                --build_dir /build --cmake_generator Ninja \
                --config Release --update --build \
                --skip_submodule_sync \
                --build_shared_lib \
                --parallel --use_vcpkg \
                --build_wheel \
                --enable_onnx_tests --use_cuda --cuda_version=11.8 --cuda_home=/usr/local/cuda-11.8 --cudnn_home=/usr/local/cuda-11.8 \
                --enable_cuda_profiling \
                --enable_pybind --build_java \
                --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=75;86" '
        workingDirectory: $(Build.SourcesDirectory)
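    # --use_vcpkg makes build.py resolve C/C++ dependencies through vcpkg,
    # matching the other pipelines that already build with VCPKG enabled.
    # A minimal sketch for reproducing the build locally (illustrative only:
    # assumes Docker and the image built above; paths and flags trimmed):
    #   docker run --rm -v $PWD:/onnxruntime_src -v /tmp/ort-build:/build \
    #     onnxruntimecuda11manylinuxbuild \
    #     /opt/python/cp310-cp310/bin/python3 /onnxruntime_src/tools/ci_build/build.py \
    #       --build_dir /build --config Release --update --build --use_vcpkg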
    - task: CmdLine@2
      inputs:
        script: |
          # Trim the build tree before publishing: drop the onnxruntime and pybind11
          # checkouts, the models symlink, and everything under _deps except onnx-src.
          rm -rf $(Build.BinariesDirectory)/Release/onnxruntime $(Build.BinariesDirectory)/Release/pybind11
          rm -f $(Build.BinariesDirectory)/Release/models
          find $(Build.BinariesDirectory)/Release/_deps -mindepth 1 ! -regex '^$(Build.BinariesDirectory)/Release/_deps/onnx-src\(/.*\)?' -delete
          cd $(Build.BinariesDirectory)/Release
          # Record which files are executable, so consumers can restore permissions
          # (artifact packaging does not preserve the executable bit).
          find -executable -type f > $(Build.BinariesDirectory)/Release/perms.txt
    - script: |
        set -ex
        mkdir -p $(Agent.TempDirectory)/ort
        cp $(Build.BinariesDirectory)/Release/dist/*.whl $(Agent.TempDirectory)/ort/
      displayName: 'Copy Wheels'
    - task: PublishPipelineArtifact@0
      displayName: 'Publish Pipeline Artifact'
      inputs:
        artifactName: 'drop-ort-linux-gpu'
        targetPath: '$(Agent.TempDirectory)/ort'
    - template: templates/explicitly-defined-final-tasks.yml
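# The drop-ort-linux-gpu wheel artifact published above is the single build
# output consumed by every downstream stage (Stable_Diffusion, Llama2_7B_ONNX,
# and Whisper_ONNX).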
- stage: Stable_Diffusion
  dependsOn:
  - Build_Onnxruntime_Cuda
  jobs:
  - job: Stable_Diffusion
    variables:
      skipComponentGovernanceDetection: true
      CLIP_MODEL_CACHE: $(Agent.TempDirectory)/clip_cache
      STABLE_DIFFUSION_MODEL_CACHE: $(Agent.TempDirectory)/stablediffusion_cache
      GenerateImage_DIR: $(Agent.TempDirectory)/images
      hitAnother: 'False'
    workspace:
      clean: all
    pool: onnxruntime-Linux-GPU-A10-12G
    steps:
    - checkout: self
      clean: true
      submodules: none
    - template: templates/flex-downloadPipelineArtifact.yml
      parameters:
        StepName: 'Download Onnxruntime Artifact'
        ArtifactName: 'drop-ort-linux-gpu'
        TargetPath: '$(Build.BinariesDirectory)/Release'
        SpecificArtifact: ${{ parameters.specificArtifact }}
        BuildId: ${{ parameters.BuildId }}
    - template: templates/get-docker-image-steps.yml
      parameters:
        Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv
        Context: tools/ci_build/github/linux/docker/
        ScriptName: tools/ci_build/get_docker_image.py
        DockerBuildArgs: "
          --build-arg BUILD_UID=$( id -u )
          "
        Repository: onnxruntimeubuntupackagestest_cuda11
        UseImageCacheContainerRegistry: false
    - task: Cache@2
      inputs:
        key: stable_diffusion | $(Build.SourcesDirectory)/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py
        restoreKeys: |
          stable_diffusion | $(Build.SourcesDirectory)/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py
          stable_diffusion
        path: $(STABLE_DIFFUSION_MODEL_CACHE)
      displayName: Cache stable diffusion model
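    # Cache@2 hashes the file segment of the key, so the cached model directory
    # is invalidated whenever pipeline_stable_diffusion.py changes; the bare
    # "stable_diffusion" restore key falls back to the most recent prior cache.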
    - script: |
        mkdir -p $(GenerateImage_DIR)
        docker run --rm --gpus all -v $PWD:/workspace \
          -v $(Build.BinariesDirectory)/Release:/Release \
          -v $(STABLE_DIFFUSION_MODEL_CACHE):/model_cache:rw \
          -v $(GenerateImage_DIR):/images:rw \
          onnxruntimeubuntupackagestest_cuda11 \
          bash -c ' \
            set -ex; \
            python3 --version; \
            python3 -m pip install --upgrade pip; \
            python3 -m pip install /Release/*.whl; \
            pushd /workspace/onnxruntime/python/tools/transformers/models/stable_diffusion; \
            python3 -m pip install -r requirements/cuda11/requirements.txt; \
            python3 -m pip install numpy==1.22.2; \
            python3 -m pip install --upgrade polygraphy onnx-graphsurgeon ; \
            echo Generate an image guided by a text prompt; \
            python3 demo_txt2img.py --framework-model-dir /model_cache --seed 1 --deterministic "astronaut riding a horse on mars" ; \
            find $(pwd)/ORT_CUDA -name "*.png" -exec cp {} /images/ \; ; \
            popd ; \
          '
      displayName: 'Run stable diffusion demo'
      workingDirectory: $(Build.SourcesDirectory)
    # For verification, we check that the generated image looks as expected.
    # Because the artifact isn't used by other jobs, we include the job attempt
    # in the artifact name, so the job can be rerun without hitting the
    # "artifact has already been published" error.
    - task: PublishPipelineArtifact@0
      displayName: 'Publish Generated Image Artifact'
      inputs:
        artifactName: Generated-Image-$(System.JobAttempt)
        targetPath: '$(GenerateImage_DIR)'
    - task: Cache@2
      inputs:
        key: clip_model | $(Build.SourcesDirectory)/onnxruntime/python/tools/transformers/models/stable_diffusion/test/check_image.py
        restoreKeys: |
          clip_model | $(Build.SourcesDirectory)/onnxruntime/python/tools/transformers/models/stable_diffusion/test/check_image.py
          clip_model
        path: $(CLIP_MODEL_CACHE)
      displayName: Cache clip model
    - script: |
        docker run --rm --gpus all -v $PWD:/workspace \
          -v $(CLIP_MODEL_CACHE):/model_cache:rw \
          onnxruntimeubuntupackagestest_cuda11 \
          bash -c '
            set -x; \
            python3 --version; \
            python3 -m pip install --upgrade pip; \
            pushd /workspace/onnxruntime/python/tools/transformers/models/stable_diffusion/; \
            image2=$(find $(pwd) -name "astronaut_riding_a_h*.png") ; \
            pushd test; \
            python3 -m pip install -r requirements.txt; \
            echo "check that the image generated by demo_txt2img.py is not the known-bad image"; \
            python3 -u check_image.py --image1 astronaut_riding_error.png --image2 $image2 --cache_dir /model_cache --negative; \
            if [ $? -ne 0 ]; then echo "Hit an unexpected image"; exit 1; fi; \
            popd ; \
            popd ; \
          ' || ( echo "##vso[task.setvariable variable=hitAnother;]True" && exit 1 )
      displayName: 'Check that the generated image is not a known-bad image'
      workingDirectory: $(Build.SourcesDirectory)
      # If the generated image matches another test image, mark the job as a warning.
      continueOnError: true
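    # "##vso[task.setvariable variable=hitAnother;]True" above is an Azure DevOps
    # logging command: it sets the hitAnother job variable for subsequent steps,
    # which the final "Check the generated image" step reads in its condition.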
    - bash: |
        echo "You can use variables: $(hitAnother)"
    # The next step runs only if the generated image did not match another test image.
    - script: |
        docker run --rm --gpus all -v $PWD:/workspace \
          -v $(CLIP_MODEL_CACHE):/model_cache:rw \
          onnxruntimeubuntupackagestest_cuda11 \
          bash -c '
            set -ex; \
            python3 --version; \
            python3 -m pip install --upgrade pip; \
            pushd /workspace/onnxruntime/python/tools/transformers/models/stable_diffusion/; \
            image2=$(find $(pwd) -name "astronaut_riding_a_h*.png") ; \
            pushd test; \
            python3 -m pip install numpy==1.22.2; \
            python3 -m pip install -r requirements.txt; \
            echo "check that the image generated by demo_txt2img.py matches the reference image"; \
            python3 -u check_image.py --image1 astronaut_riding_txt2image-DDIM-50.png --image2 $image2 --cache_dir /model_cache ; \
            popd ; \
            popd ; \
          '
      displayName: 'Check the generated image'
      workingDirectory: $(Build.SourcesDirectory)
      condition: ne(variables.hitAnother, 'True')
- stage: Llama2_7B_ONNX
  dependsOn:
  - Build_Onnxruntime_Cuda
  condition: or(eq(variables['Build.SourceBranch'], 'refs/heads/main'), startsWith(variables['Build.SourceBranch'], 'refs/heads/rel-'), eq(variables['UseA100'], '1'))
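  # Runs only on main, on release branches, or when a run is queued with the
  # UseA100 pipeline variable set to '1', since this stage targets the A100 pool.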
  jobs:
  - job: Llama2_7B_ONNX
    timeoutInMinutes: 120
    variables:
      skipComponentGovernanceDetection: true
    workspace:
      clean: all
    pool:
      name: Onnxruntime-Linux-GPU-A100-WUS3
      demands:
      - WorkFolder -equals /mnt/storage/
    steps:
    - checkout: self
      clean: true
      submodules: none
    - template: templates/flex-downloadPipelineArtifact.yml
      parameters:
        StepName: 'Download Onnxruntime Artifact'
        ArtifactName: 'drop-ort-linux-gpu'
        TargetPath: '$(Build.BinariesDirectory)/ort-artifact/'
        SpecificArtifact: ${{ parameters.specificArtifact }}
        BuildId: ${{ parameters.BuildId }}
    - template: templates/get-docker-image-steps.yml
      parameters:
        Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch
        Context: tools/ci_build/github/linux/docker/
        ScriptName: tools/ci_build/get_docker_image.py
        DockerBuildArgs: "
          --build-arg BUILD_UID=$( id -u )
          --build-arg BASEIMAGE=${{ variables.docker_base_image }}
          --build-arg TRT_VERSION=${{ variables.linux_trt_version }}
          "
        Repository: onnxruntimeubi8packagestest_torch
        UseImageCacheContainerRegistry: false
    - task: DownloadPackage@1
      displayName: 'Download Meta Llama2 model'
      inputs:
        packageType: upack
        feed: '/7424c8e4-5c62-490e-95c4-79446f31017c'
        version: 1.0.0
        definition: '6fe0c4ed-9d0e-4d66-94cc-fb6a111d02a5'
        downloadPath: $(Agent.TempDirectory)/meta_llama2_7b_hf
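    # packageType: upack downloads a Universal Package from an Azure Artifacts
    # feed; the feed/definition GUIDs identify the pre-staged Llama-2-7b-hf
    # weights, pinned at version 1.0.0.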
    - script: |
        docker run --rm --gpus all -v $(Build.SourcesDirectory):/workspace \
          -v $(Build.BinariesDirectory)/ort-artifact/:/ort-artifact \
          -v $(Agent.TempDirectory)/meta_llama2_7b_hf:/meta-llama2 \
          onnxruntimeubi8packagestest_torch \
          bash -c "
            set -ex; \
            pushd /workspace/onnxruntime/python/tools/transformers/ ; \
            python3 -m pip install --upgrade pip ; \
            pushd models/llama ; \
            python3 -m pip install -r requirements.txt ; \
            popd ; \
            python3 -m pip install /ort-artifact/*.whl ; \
            python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --input /meta-llama2 --output llama2-7b-fp16 --precision fp16 --execution_provider cuda --small_gp;\
            ls -l llama2-7b-fp16; \
            du -sh llama2-7b-fp16; \
            popd ; \
          "
      displayName: 'Convert Llama2 to ONNX fp16 and run parity test'
      workingDirectory: $(Build.SourcesDirectory)
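    # The next two steps repeat this conversion for fp32 and int4; only the
    # --precision/--output values and the extra flags (--small_gp above,
    # --use_gqa for int4) differ.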
    - script: |
        docker run --rm --gpus all -v $(Build.SourcesDirectory):/workspace \
          -v $(Build.BinariesDirectory)/ort-artifact/:/ort-artifact \
          -v $(Agent.TempDirectory)/meta_llama2_7b_hf:/meta-llama2 \
          onnxruntimeubi8packagestest_torch \
          bash -c "
            set -ex; \
            pushd /workspace/onnxruntime/python/tools/transformers/ ; \
            python3 -m pip install --upgrade pip ; \
            pushd models/llama ; \
            python3 -m pip install -r requirements.txt ; \
            popd ; \
            python3 -m pip install /ort-artifact/*.whl ; \
            python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --input /meta-llama2 --output llama2-7b-fp32-gpu --precision fp32 --execution_provider cuda;\
            ls -l llama2-7b-fp32-gpu; \
            du -sh llama2-7b-fp32-gpu; \
            popd ; \
          "
      displayName: 'Convert Llama2 to ONNX fp32 and run parity test'
      workingDirectory: $(Build.SourcesDirectory)
    - script: |
        docker run --rm --gpus all -v $(Build.SourcesDirectory):/workspace \
          -v $(Build.BinariesDirectory)/ort-artifact/:/ort-artifact \
          -v $(Agent.TempDirectory)/meta_llama2_7b_hf:/meta-llama2 \
          onnxruntimeubi8packagestest_torch \
          bash -c "
            set -ex; \
            pushd /workspace/onnxruntime/python/tools/transformers/ ; \
            python3 -m pip install --upgrade pip ; \
            pushd models/llama ; \
            python3 -m pip install -r requirements.txt ; \
            popd ; \
            python3 -m pip install /ort-artifact/*.whl ; \
            python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --input /meta-llama2 --output llama2-7b-int4-gpu --precision int4 --execution_provider cuda --use_gqa;\
            ls -l llama2-7b-int4-gpu; \
            du -sh llama2-7b-int4-gpu; \
            popd ; \
          "
      displayName: 'Convert Llama2 to ONNX int4 and run parity test'
      workingDirectory: $(Build.SourcesDirectory)
- stage: Whisper_ONNX
  dependsOn:
  - Build_Onnxruntime_Cuda
  jobs:
  - job: Whisper_ONNX
    variables:
      skipComponentGovernanceDetection: true
    workspace:
      clean: all
    pool: Onnxruntime-Linux-A10-24G
    steps:
    - checkout: self
      clean: true
      submodules: none
    - template: templates/flex-downloadPipelineArtifact.yml
      parameters:
        StepName: 'Download Onnxruntime Artifact'
        ArtifactName: 'drop-ort-linux-gpu'
        TargetPath: '$(Build.BinariesDirectory)/ort-artifact/'
        SpecificArtifact: ${{ parameters.specificArtifact }}
        BuildId: ${{ parameters.BuildId }}
    - script: |
        mkdir -p $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/ompffmpeg/
        azcopy cp --recursive "https://lotusscus.blob.core.windows.net/models/ffmpeg/runtimes/linux-x64/native" $(Agent.TempDirectory)/ompffmpeg
        # Copy the ffmpeg binaries into the Docker build context so the image build can pick them up.
        cp $(Agent.TempDirectory)/ompffmpeg/native/* $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/ompffmpeg/
        ls $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/ompffmpeg/
      displayName: 'Download OMP FFmpeg'
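    # Inside the container the test step expects these libraries at
    # /tmp/ompffmpeg (see the LD_LIBRARY_PATH export in the final step below).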
    - template: templates/get-docker-image-steps.yml
      parameters:
        Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg
        Context: tools/ci_build/github/linux/docker/
        ScriptName: tools/ci_build/get_docker_image.py
        DockerBuildArgs: '--build-arg BUILD_UID=$( id -u )'
        Repository: onnxruntimepackagestest_ompffmpeg
    - task: DownloadPackage@1
      # The model data in this artifact was downloaded from openai/whisper-large-v3
      # on the Hugging Face model hub. To save space, the .git directory and pickled
      # files were removed, keeping only the safetensors model files.
      displayName: 'Download Whisper Model'
      inputs:
        packageType: upack
        feed: '/7424c8e4-5c62-490e-95c4-79446f31017c'
        version: 1.0.0
        definition: 'b583ce7c-1a8f-4099-ae28-5d5f56c478b1'
        downloadPath: $(Agent.TempDirectory)/whisper_large_v3
    - script: |
        docker run --rm --gpus all -v $(Build.SourcesDirectory):/workspace \
          -v $(Build.BinariesDirectory)/ort-artifact/:/ort-artifact \
          -v $(Agent.TempDirectory)/whisper_large_v3:/whisper_large_v3 \
          onnxruntimepackagestest_ompffmpeg \
          bash -c '
            set -ex; \
            pushd /workspace/onnxruntime/python/tools/transformers/ ; \
            python3 -m pip install --upgrade pip ; \
            pushd models/whisper ; \
            python3 -m pip install -r requirements.txt ; \
            popd ; \
            python3 -m pip install /ort-artifact/*.whl ; \
            python3 -m pip uninstall -y torch ; \
            python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu118 ; \
            python3 -m models.whisper.convert_to_onnx -m /whisper_large_v3 --output whisperlargev3 --use_external_data_format ; \
            popd ; \
          '
      displayName: 'Convert Whisper Model'
      workingDirectory: $(Build.SourcesDirectory)
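    # The conversion writes whisper_large_v3_beamsearch.onnx under whisperlargev3/;
    # the benchmark step below points --ort-model-path at it and diffs the output
    # against a checked-in reference transcription.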
    - script: |
        docker run --rm --gpus all -v $(Build.SourcesDirectory):/workspace \
          -v $(Build.BinariesDirectory)/ort-artifact/:/ort-artifact \
          -v $(Agent.TempDirectory)/whisper_large_v3:/whisper_large_v3 \
          onnxruntimepackagestest_ompffmpeg \
          bash -c '
            set -ex; \
            pushd /workspace/onnxruntime/python/tools/transformers/ ; \
            python3 -m pip install --upgrade pip ; \
            pushd models/whisper ; \
            python3 -m pip install -r requirements.txt ; \
            popd ; \
            python3 -m pip install /ort-artifact/*.whl ; \
            python3 -m pip uninstall -y torch ; \
            python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu118 ; \
            ls whisperlargev3; \
            export LD_LIBRARY_PATH=/tmp/ompffmpeg:${LD_LIBRARY_PATH}; \
            ffmpeg -version; \
            python3 -m models.whisper.benchmark \
              --benchmark-type ort \
              --audio-path models/whisper/test/1272-141231-0002.mp3 \
              --model-name openai/whisper-large-v3 \
              --ort-model-path /workspace/onnxruntime/python/tools/transformers/whisperlargev3/whisper_large_v3_beamsearch.onnx \
              --precision fp32 \
              --device cuda > ort_output.txt ; \
            cat ort_output.txt ; \
            diff ort_output.txt /workspace/onnxruntime/python/tools/transformers/models/whisper/test/whisper_ort_output.txt && exit 0 || exit 1 ; \
            popd ; \
          '
      displayName: 'Test Whisper ONNX Model'
      workingDirectory: $(Build.SourcesDirectory)