diff --git a/setup.py b/setup.py index 9ad5390a11..2f5b99b5e1 100644 --- a/setup.py +++ b/setup.py @@ -445,6 +445,7 @@ requirements_file = "requirements.txt" local_version = None enable_training = parse_arg_remove_boolean(sys.argv, "--enable_training") enable_training_on_device = parse_arg_remove_boolean(sys.argv, "--enable_training_on_device") +enable_rocm_profiling = parse_arg_remove_boolean(sys.argv, "--enable_rocm_profiling") disable_auditwheel_repair = parse_arg_remove_boolean(sys.argv, "--disable_auditwheel_repair") default_training_package_device = parse_arg_remove_boolean(sys.argv, "--default_training_package_device") @@ -611,6 +612,8 @@ if nightly_build: if local_version: version_number = version_number + local_version + if is_rocm and enable_rocm_profiling: + version_number = version_number + ".profiling" if wheel_name_suffix: if not (enable_training and wheel_name_suffix == "gpu"): diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index fa3a646895..23f35d9654 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -2053,6 +2053,7 @@ def build_python_wheel( use_ninja=False, build_eager_mode=False, enable_training_on_device=False, + enable_rocm_profiling=False, ): for config in configs: cwd = get_config_build_dir(build_dir, config) @@ -2074,6 +2075,8 @@ def build_python_wheel( args.append("--enable_training_on_device") if build_eager_mode: args.append("--disable_auditwheel_repair") + if enable_rocm_profiling: + args.append("--enable_rocm_profiling") # The following arguments are mutually exclusive if use_cuda: @@ -2776,6 +2779,7 @@ def main(): use_ninja=(args.cmake_generator == "Ninja"), build_eager_mode=args.build_eager_mode, enable_training_on_device=args.enable_training_on_device, + enable_rocm_profiling=args.enable_rocm_profiling, ) if args.build_nuget: build_nuget_package( diff --git a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-rocm.yml b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-rocm.yml index 1bf6cf1a47..4505e1249a 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-rocm.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-rocm.yml @@ -35,3 +35,18 @@ stages: parameters: PythonVersion: '3.9' RocmVersion: '5.3' + - template: templates/rocm.yml + parameters: + PythonVersion: '3.7' + RocmVersion: '5.3' + BuildConfig: 'RelWithDebInfo' + - template: templates/rocm.yml + parameters: + PythonVersion: '3.8' + RocmVersion: '5.3' + BuildConfig: 'RelWithDebInfo' + - template: templates/rocm.yml + parameters: + PythonVersion: '3.9' + RocmVersion: '5.3' + BuildConfig: 'RelWithDebInfo' diff --git a/tools/ci_build/github/azure-pipelines/templates/rocm.yml b/tools/ci_build/github/azure-pipelines/templates/rocm.yml index 3250fe5065..7020c4b683 100644 --- a/tools/ci_build/github/azure-pipelines/templates/rocm.yml +++ b/tools/ci_build/github/azure-pipelines/templates/rocm.yml @@ -5,18 +5,39 @@ parameters: - name: RocmVersion type: string - +- name: BuildConfig + type: string + default: 'Release' jobs: -- job: wheels_python_${{ replace(parameters.PythonVersion,'.','_') }}_rocm_${{ replace(parameters.RocmVersion,'.','_') }} +- job: wheels_python_${{ replace(parameters.PythonVersion,'.','_') }}_rocm_${{ replace(parameters.RocmVersion,'.','_') }}_${{ parameters.BuildConfig }} workspace: clean: all timeoutInMinutes: 180 pool: Ubuntu-2004-rocm-aiinfra variables: - PythonVersion: ${{ parameters.PythonVersion }} + - name: PythonVersion + value: ${{ parameters.PythonVersion }} + - name: EnableProfiling + ${{ if eq(parameters.BuildConfig, 'Release') }}: + value: '' + ${{ else }}: + value: '--enable_rocm_profiling' + - name: ArtifactName + ${{ if eq(parameters.BuildConfig, 'Release') }}: + value: 'onnxruntime_rocm' + ${{ else }}: + value: 'onnxruntime_rocm_enable_profiling' steps: + - task: CmdLine@2 + displayName: 'check variables' + inputs: + script: | + echo "BuildConfig is "${{ parameters.BuildConfig }} && \ + echo "EnableProfiling is "${{ variables['EnableProfiling'] }} && \ + echo "ArtifactName is "${{ variables['ArtifactName'] }} + - checkout: self clean: true submodules: recursive @@ -34,6 +55,7 @@ jobs: --build-arg PREPEND_PATH=/opt/rh/devtoolset-10/root/usr/bin: --build-arg LD_LIBRARY_PATH_ARG=/opt/rh/devtoolset-10/root/usr/lib64:/opt/rh/devtoolset-10/root/usr/lib:/opt/rh/devtoolset-10/root/usr/lib64/dyninst:/opt/rh/devtoolset-10/root/usr/lib/dyninst:/usr/local/lib64:/usr/local/lib Repository: onnxruntimetrainingrocmbuild-rocm${{ parameters.RocmVersion }} + - task: CmdLine@2 inputs: script: | @@ -53,7 +75,7 @@ jobs: --user onnxruntimedev \ onnxruntimetrainingrocmbuild-rocm${{ parameters.RocmVersion }} \ /onnxruntime_src/tools/ci_build/build.py \ - --config Release \ + --config ${{ parameters.BuildConfig }} \ --use_rocm \ --rocm_version=${{ parameters.RocmVersion }} \ --rocm_home=/opt/rocm \ @@ -68,7 +90,8 @@ jobs: --cmake_extra_defines \ CMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \ onnxruntime_BUILD_UNIT_TESTS=OFF \ - --enable_training_torch_interop + --enable_training_torch_interop \ + ${{ variables['EnableProfiling'] }} workingDirectory: $(Build.SourcesDirectory) displayName: 'Build onnxruntime (in container)' @@ -85,18 +108,19 @@ jobs: render_gid=$(getent group | awk '/render/ {split($0,a,":"); print(a[3])}') echo "Found render_gid=$render_gid; attempting to set as pipeline variable" echo "##vso[task.setvariable variable=render]$render_gid" + condition: and(succeeded(), eq('${{ parameters.BuildConfig }}', 'Release')) displayName: 'Find video and render gid to be mapped into container' - task: CopyFiles@2 displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' inputs: SourceFolder: '$(Build.BinariesDirectory)' - Contents: 'Release/dist/*.whl' + Contents: "${{ parameters.BuildConfig }}/dist/*.whl" TargetFolder: '$(Build.ArtifactStagingDirectory)' - task: CmdLine@2 displayName: 'Build Python Documentation' - condition: and(succeeded(), ne(variables['PythonVersion'], '3.9')) # tensorflow not available on python 3.9 + condition: and(succeeded(), ne('${{ parameters.PythonVersion }}', '3.9'), eq('${{ parameters.BuildConfig }}', 'Release')) # tensorflow not available on python 3.9 inputs: script: | mkdir -p $HOME/.onnx @@ -123,7 +147,7 @@ jobs: - task: CopyFiles@2 displayName: 'Copy Python Documentation to: $(Build.ArtifactStagingDirectory)' - condition: and(succeeded(), ne(variables['PythonVersion'], '3.9')) # tensorflow not available on python 3.9 + condition: and(succeeded(), ne('${{ parameters.PythonVersion }}', '3.9'), eq('${{ parameters.BuildConfig }}', 'Release')) # tensorflow not available on python 3.9 inputs: SourceFolder: '$(Build.BinariesDirectory)/docs/training/html' Contents: '**' @@ -132,10 +156,10 @@ jobs: - task: PublishBuildArtifacts@1 displayName: 'Upload Rocm wheel as build artifact' inputs: - ArtifactName: onnxruntime_rocm + ArtifactName: ${{ variables['ArtifactName'] }} - script: | - files=($(Build.ArtifactStagingDirectory)/Release/dist/*.whl) && \ + files=($(Build.ArtifactStagingDirectory)/${{ parameters.BuildConfig }}/dist/*.whl) && \ echo ${files[0]} && \ python3 tools/ci_build/upload_python_package_to_azure_storage.py \ --python_wheel_path ${files[0]} \