diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 8d9f08cee0..31ebf58b03 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -86,7 +86,7 @@ option(onnxruntime_USE_CUDA "Build with CUDA support" OFF) # use. If you hit any problem with that, please do not report it to GTest. Turn OFF the following build option instead. cmake_dependent_option(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS "Build with CUDA unit tests" OFF "onnxruntime_USE_CUDA;onnxruntime_BUILD_UNIT_TESTS" OFF) -option(onnxruntime_USE_CUDA_NHWC_OPS "Build CUDA with NHWC op support" OFF) +cmake_dependent_option(onnxruntime_USE_CUDA_NHWC_OPS "Build CUDA with NHWC op support" ON "onnxruntime_USE_CUDA" OFF) option(onnxruntime_CUDA_MINIMAL "Build CUDA without any operations apart from memcpy ops. Usefuel for a very minial TRT build" OFF) option(onnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO "When building with CUDA support, generate device code line number information." OFF) option(onnxruntime_USE_OPENVINO "Build with OpenVINO support" OFF) diff --git a/dockerfiles/Dockerfile.cuda b/dockerfiles/Dockerfile.cuda index d2d656648f..ce4560e9b0 100644 --- a/dockerfiles/Dockerfile.cuda +++ b/dockerfiles/Dockerfile.cuda @@ -56,7 +56,6 @@ RUN cd /code \ --build_shared_lib --skip_tests \ --config Release --build_wheel --update --build --parallel \ --cmake_generator Ninja \ - --enable_cuda_nhwc_ops \ --cmake_extra_defines ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) "CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}" onnxruntime_BUILD_UNIT_TESTS=OFF # Start second stage to copy the build artifacts diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index 5fb1e54b38..e23a52757d 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -925,6 +925,35 @@ Do not modify directly.* |WhisperBeamSearch|*in* input_ids:**F**
*in* max_length:**I**
*in* min_length:**I**
*in* num_beams:**I**
*in* num_return_sequences:**I**
*in* length_penalty:**T**
*in* repetition_penalty:**T**
*in* vocab_mask:**M**
*in* prefix_vocab_mask:**M**
*in* attention_mask:**I**
*in* decoder_input_ids:**I**
*in* logits_processor:**I**
*in* cross_qk_layer_head:**I**
*in* extra_decoding_ids:**I**
*in* temperature:**T**
*out* sequences:**I**
*out* sequences_scores:**T**
*out* scores:**T**
*out* cross_qk:**V**
*out* non_speech_probs:**T**|1+|**T** = tensor(float), tensor(float16)| | | | | +|**Operator Domain:** *com.ms.internal.nhwc*|||| +|AveragePool|*in* X:**T**
*out* Y:**T**|11+|**T** = tensor(float), tensor(float16)| +|||10|**T** = tensor(float), tensor(float16)| +|||[7, 9]|**T** = tensor(float), tensor(float16)| +|BatchNormalization|*in* X:**T**
*in* scale:**T**
*in* B:**T**
*in* input_mean:**U**
*in* input_var:**U**
*out* Y:**T**
*out* running_mean:**U**
*out* running_var:**U**

or

*in* X:**T**
*in* scale:**T**
*in* B:**T**
*in* mean:**T**
*in* var:**T**
*out* Y:**T**
*out* mean:**T**
*out* var:**T**
*out* saved_mean:**T**
*out* saved_var:**T**

or

*in* X:**T**
*in* scale:**T1**
*in* B:**T1**
*in* input_mean:**T2**
*in* input_var:**T2**
*out* Y:**T**
*out* running_mean:**T2**
*out* running_var:**T2**|15+|**T** = tensor(double), tensor(float), tensor(float16)
**T1** = tensor(double), tensor(float), tensor(float16)
**T2** = tensor(double), tensor(float), tensor(float16)| +|||14|**T** = tensor(double), tensor(float), tensor(float16)
**U** = tensor(double), tensor(float), tensor(float16)| +|||[9, 13]|**T** = tensor(double), tensor(float), tensor(float16)| +|||[7, 8]|**T** = tensor(double), tensor(float), tensor(float16)| +|Conv|*in* X:**T**
*in* W:**T**
*in* B:**T**
*out* Y:**T**|11+|**T** = tensor(float), tensor(float16)| +|||[1, 10]|**T** = tensor(float), tensor(float16)| +|ConvTranspose|*in* X:**T**
*in* W:**T**
*in* B:**T**
*out* Y:**T**|11+|**T** = tensor(float), tensor(float16)| +|||[1, 10]|**T** = tensor(float), tensor(float16)| +|DepthToSpace|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16)| +|||[11, 12]|**T** = tensor(double), tensor(float), tensor(float16)| +|||[1, 10]|**T** = tensor(double), tensor(float), tensor(float16)| +|GlobalAveragePool|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| +|GlobalMaxPool|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| +|GridSample|*in* X:**T1**
*in* grid:**T2**
*out* Y:**T1**|16+|**T1** = tensor(float)
**T2** = tensor(float)| +|LRN|*in* X:**T**
*out* Y:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16)| +|||[1, 12]|**T** = tensor(double), tensor(float), tensor(float16)| +|MaxPool|*in* X:**T**
*out* Y:**T**

or

*in* X:**T**
*out* Y:**T**
*out* Indices:**I**|12+|**I** = tensor(int64)
**T** = tensor(float), tensor(float16), tensor(int8), tensor(uint8)| +|||11|**I** = tensor(int64)
**T** = tensor(float), tensor(float16)| +|||10|**I** = tensor(int64)
**T** = tensor(float), tensor(float16)| +|||[8, 9]|**I** = tensor(int64)
**T** = tensor(float), tensor(float16)| +|||[1, 7]|**T** = tensor(float), tensor(float16)| +|SpaceToDepth|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16)| +|||[1, 12]|**T** = tensor(double), tensor(float), tensor(float16)| +| | +| | diff --git a/onnxruntime/python/tools/transformers/models/sam2/benchmark_sam2.sh b/onnxruntime/python/tools/transformers/models/sam2/benchmark_sam2.sh index e6da988f5c..9e97867657 100644 --- a/onnxruntime/python/tools/transformers/models/sam2/benchmark_sam2.sh +++ b/onnxruntime/python/tools/transformers/models/sam2/benchmark_sam2.sh @@ -191,7 +191,6 @@ build_onnxruntime_gpu_for_profiling() { --build_wheel --skip_tests \ --cmake_generator Ninja \ --compile_no_warning_as_error \ - --enable_cuda_nhwc_ops \ --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=$CUDA_ARCH \ --cmake_extra_defines onnxruntime_ENABLE_NVTX_PROFILE=ON \ --enable_cuda_line_info diff --git a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc index 25caa732ef..a3a3dd939c 100644 --- a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc @@ -1,8 +1,9 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. - +#include "core/graph/constants.h" #include "gtest/gtest.h" #include "test/providers/provider_test_utils.h" + using namespace std; namespace onnxruntime { namespace test { @@ -28,7 +29,8 @@ void TestConvOp(const ConvOpAndTestAttributes& attributes, optional epsilon = optional(), OpTester::ExpectResult expect_result = OpTester::ExpectResult::kExpectSuccess, const std::string& err_str = "", - int opset = 7) { + int opset = 7, + bool exclude_cuda_nhwc = false) { OpTester test("Conv", opset); test.AddAttribute("group", attributes.group); test.AddAttribute("kernel_shape", attributes.kernel_shape); @@ -65,6 +67,12 @@ void TestConvOp(const ConvOpAndTestAttributes& attributes, // Disable TensorRT because weight as input is not supported excluded_providers.insert(kTensorrtExecutionProvider); + if (exclude_cuda_nhwc) { +#ifdef ENABLE_CUDA_NHWC_OPS + excluded_providers.insert(kCudaNHWCExecutionProvider); +#endif + } + // QNN SDK 2.10.0 has a bug that breaks support for dynamic bias inputs. excluded_providers.insert(kQnnExecutionProvider); @@ -197,10 +205,15 @@ TEST(ConvTest, Conv1D_Bias) { // as TF32 has a 10 bit mantissa. float epsilon = 1.1e-5f; - TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, false, epsilon); + // This case is not supported by cuDNN frontend, and the fallback (legacy code) requires weight to 4D tensor for NHWC. + constexpr bool exclude_cuda_nhwc = true; + + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, false, epsilon, + OpTester::ExpectResult::kExpectSuccess, "", 10, exclude_cuda_nhwc); // CoreML EP requires weight to be an initializer - TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true, epsilon); + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true, epsilon, + OpTester::ExpectResult::kExpectSuccess, "", 10, exclude_cuda_nhwc); } // Conv47 diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 5cc040fd70..24dc6124d4 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -13,6 +13,7 @@ import shlex import shutil import subprocess import sys +import warnings from pathlib import Path @@ -253,7 +254,12 @@ def parse_arguments(): "--cudnn_home is not specified.", ) parser.add_argument("--enable_cuda_line_info", action="store_true", help="Enable CUDA line info.") - parser.add_argument("--enable_cuda_nhwc_ops", action="store_true", help="Enable CUDA NHWC ops in build.") + + parser.add_argument( + "--enable_cuda_nhwc_ops", action="store_true", help="Deprecated; default to enable CUDA NHWC ops in build." + ) + + parser.add_argument("--disable_cuda_nhwc_ops", action="store_true", help="Disable CUDA NHWC ops in build.") # Python bindings parser.add_argument("--enable_pybind", action="store_true", help="Enable Python Bindings.") @@ -793,6 +799,11 @@ def parse_arguments(): if args.cmake_generator is None and is_windows(): args.cmake_generator = "Ninja" if args.build_wasm else "Visual Studio 17 2022" + if args.enable_cuda_nhwc_ops: + warnings.warn( + "The argument '--enable_cuda_nhwc_ops' is deprecated and is default to True. ", DeprecationWarning + ) + return args @@ -1074,7 +1085,7 @@ def generate_build_tree( "-Donnxruntime_USE_MPI=" + ("ON" if args.use_mpi else "OFF"), "-Donnxruntime_ENABLE_MEMORY_PROFILE=" + ("ON" if args.enable_memory_profile else "OFF"), "-Donnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO=" + ("ON" if args.enable_cuda_line_info else "OFF"), - "-Donnxruntime_USE_CUDA_NHWC_OPS=" + ("ON" if args.enable_cuda_nhwc_ops else "OFF"), + "-Donnxruntime_USE_CUDA_NHWC_OPS=" + ("ON" if args.use_cuda and not args.disable_cuda_nhwc_ops else "OFF"), "-Donnxruntime_BUILD_WEBASSEMBLY_STATIC_LIB=" + ("ON" if args.build_wasm_static_lib else "OFF"), "-Donnxruntime_ENABLE_WEBASSEMBLY_EXCEPTION_CATCHING=" + ("OFF" if args.disable_wasm_exception_catching else "ON"), diff --git a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml index 5af95d3457..0da1f8fc8d 100644 --- a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml @@ -123,7 +123,7 @@ stages: --parallel \ --build_wheel \ --enable_onnx_tests --use_cuda --cuda_version=11.8 --cuda_home=/usr/local/cuda-11.8 --cudnn_home=/usr/local/cuda-11.8 \ - --enable_cuda_profiling --enable_cuda_nhwc_ops \ + --enable_cuda_profiling \ --enable_pybind --build_java \ --use_cache \ --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=75;86' ; \ diff --git a/tools/ci_build/github/linux/build_cuda_ci.sh b/tools/ci_build/github/linux/build_cuda_ci.sh index a78e240998..0533b7b394 100755 --- a/tools/ci_build/github/linux/build_cuda_ci.sh +++ b/tools/ci_build/github/linux/build_cuda_ci.sh @@ -3,28 +3,31 @@ set -ex #Every cuda container has this $CUDA_VERSION env var set. SHORT_CUDA_VERSION=$(echo $CUDA_VERSION | sed 's/\([[:digit:]]\+\.[[:digit:]]\+\)\.[[:digit:]]\+/\1/') -BUILD_ARGS=('--config' 'Release' '--update' '--build' - '--skip_submodule_sync' - '--build_shared_lib' - '--parallel' '--use_binskim_compliant_compile_flags' - '--build_wheel' - '--enable_onnx_tests' - '--use_cuda' - "--cuda_version=$SHORT_CUDA_VERSION" - "--cuda_home=/usr/local/cuda-$SHORT_CUDA_VERSION" - "--cudnn_home=/usr/local/cuda-$SHORT_CUDA_VERSION" - "--enable_cuda_profiling" - "--enable_cuda_nhwc_ops" - "--enable_pybind" - "--build_java" - "--cmake_extra_defines" - "CMAKE_CUDA_ARCHITECTURES=75" - "onnxruntime_BUILD_UNIT_TESTS=ON" - "onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON") +BUILD_ARGS=('--config' + 'Release' + '--update' + '--build' + '--skip_submodule_sync' + '--build_shared_lib' + '--parallel' + '--use_binskim_compliant_compile_flags' + '--build_wheel' + '--enable_onnx_tests' + '--use_cuda' + "--cuda_version=$SHORT_CUDA_VERSION" + "--cuda_home=/usr/local/cuda-$SHORT_CUDA_VERSION" + "--cudnn_home=/usr/local/cuda-$SHORT_CUDA_VERSION" + "--enable_cuda_profiling" + "--enable_pybind" + "--build_java" + "--cmake_extra_defines" + "CMAKE_CUDA_ARCHITECTURES=75" + "onnxruntime_BUILD_UNIT_TESTS=ON" + "onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON") if [ -x "$(command -v ninja)" ]; then BUILD_ARGS+=('--cmake_generator' 'Ninja') fi - + if [ -d /build ]; then BUILD_ARGS+=('--build_dir' '/build') else @@ -40,7 +43,7 @@ if [ -f /opt/python/cp312-cp312/bin/python3 ]; then else python3 tools/ci_build/build.py "${BUILD_ARGS[@]}" fi -if [ -x "$(command -v ccache)" ]; then - ccache -sv +if [ -x "$(command -v ccache)" ]; then + ccache -sv ccache -z fi