diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 8d9f08cee0..31ebf58b03 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -86,7 +86,7 @@ option(onnxruntime_USE_CUDA "Build with CUDA support" OFF)
# use. If you hit any problem with that, please do not report it to GTest. Turn OFF the following build option instead.
cmake_dependent_option(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS "Build with CUDA unit tests" OFF "onnxruntime_USE_CUDA;onnxruntime_BUILD_UNIT_TESTS" OFF)
-option(onnxruntime_USE_CUDA_NHWC_OPS "Build CUDA with NHWC op support" OFF)
+cmake_dependent_option(onnxruntime_USE_CUDA_NHWC_OPS "Build CUDA with NHWC op support" ON "onnxruntime_USE_CUDA" OFF)
option(onnxruntime_CUDA_MINIMAL "Build CUDA without any operations apart from memcpy ops. Usefuel for a very minial TRT build" OFF)
option(onnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO "When building with CUDA support, generate device code line number information." OFF)
option(onnxruntime_USE_OPENVINO "Build with OpenVINO support" OFF)
diff --git a/dockerfiles/Dockerfile.cuda b/dockerfiles/Dockerfile.cuda
index d2d656648f..ce4560e9b0 100644
--- a/dockerfiles/Dockerfile.cuda
+++ b/dockerfiles/Dockerfile.cuda
@@ -56,7 +56,6 @@ RUN cd /code \
--build_shared_lib --skip_tests \
--config Release --build_wheel --update --build --parallel \
--cmake_generator Ninja \
- --enable_cuda_nhwc_ops \
--cmake_extra_defines ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) "CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}" onnxruntime_BUILD_UNIT_TESTS=OFF
# Start second stage to copy the build artifacts
diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
index 5fb1e54b38..e23a52757d 100644
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@@ -925,6 +925,35 @@ Do not modify directly.*
|WhisperBeamSearch|*in* input_ids:**F**
*in* max_length:**I**
*in* min_length:**I**
*in* num_beams:**I**
*in* num_return_sequences:**I**
*in* length_penalty:**T**
*in* repetition_penalty:**T**
*in* vocab_mask:**M**
*in* prefix_vocab_mask:**M**
*in* attention_mask:**I**
*in* decoder_input_ids:**I**
*in* logits_processor:**I**
*in* cross_qk_layer_head:**I**
*in* extra_decoding_ids:**I**
*in* temperature:**T**
*out* sequences:**I**
*out* sequences_scores:**T**
*out* scores:**T**
*out* cross_qk:**V**
*out* non_speech_probs:**T**|1+|**T** = tensor(float), tensor(float16)|
| |
| |
+|**Operator Domain:** *com.ms.internal.nhwc*||||
+|AveragePool|*in* X:**T**
*out* Y:**T**|11+|**T** = tensor(float), tensor(float16)|
+|||10|**T** = tensor(float), tensor(float16)|
+|||[7, 9]|**T** = tensor(float), tensor(float16)|
+|BatchNormalization|*in* X:**T**
*in* scale:**T**
*in* B:**T**
*in* input_mean:**U**
*in* input_var:**U**
*out* Y:**T**
*out* running_mean:**U**
*out* running_var:**U**
or
*in* X:**T**
*in* scale:**T**
*in* B:**T**
*in* mean:**T**
*in* var:**T**
*out* Y:**T**
*out* mean:**T**
*out* var:**T**
*out* saved_mean:**T**
*out* saved_var:**T**
or
*in* X:**T**
*in* scale:**T1**
*in* B:**T1**
*in* input_mean:**T2**
*in* input_var:**T2**
*out* Y:**T**
*out* running_mean:**T2**
*out* running_var:**T2**|15+|**T** = tensor(double), tensor(float), tensor(float16)
**T1** = tensor(double), tensor(float), tensor(float16)
**T2** = tensor(double), tensor(float), tensor(float16)|
+|||14|**T** = tensor(double), tensor(float), tensor(float16)
**U** = tensor(double), tensor(float), tensor(float16)|
+|||[9, 13]|**T** = tensor(double), tensor(float), tensor(float16)|
+|||[7, 8]|**T** = tensor(double), tensor(float), tensor(float16)|
+|Conv|*in* X:**T**
*in* W:**T**
*in* B:**T**
*out* Y:**T**|11+|**T** = tensor(float), tensor(float16)|
+|||[1, 10]|**T** = tensor(float), tensor(float16)|
+|ConvTranspose|*in* X:**T**
*in* W:**T**
*in* B:**T**
*out* Y:**T**|11+|**T** = tensor(float), tensor(float16)|
+|||[1, 10]|**T** = tensor(float), tensor(float16)|
+|DepthToSpace|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16)|
+|||[11, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
+|||[1, 10]|**T** = tensor(double), tensor(float), tensor(float16)|
+|GlobalAveragePool|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
+|GlobalMaxPool|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
+|GridSample|*in* X:**T1**
*in* grid:**T2**
*out* Y:**T1**|16+|**T1** = tensor(float)
**T2** = tensor(float)|
+|LRN|*in* X:**T**
*out* Y:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16)|
+|||[1, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
+|MaxPool|*in* X:**T**
*out* Y:**T**
or
*in* X:**T**
*out* Y:**T**
*out* Indices:**I**|12+|**I** = tensor(int64)
**T** = tensor(float), tensor(float16), tensor(int8), tensor(uint8)|
+|||11|**I** = tensor(int64)
**T** = tensor(float), tensor(float16)|
+|||10|**I** = tensor(int64)
**T** = tensor(float), tensor(float16)|
+|||[8, 9]|**I** = tensor(int64)
**T** = tensor(float), tensor(float16)|
+|||[1, 7]|**T** = tensor(float), tensor(float16)|
+|SpaceToDepth|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16)|
+|||[1, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
+| |
+| |
diff --git a/onnxruntime/python/tools/transformers/models/sam2/benchmark_sam2.sh b/onnxruntime/python/tools/transformers/models/sam2/benchmark_sam2.sh
index e6da988f5c..9e97867657 100644
--- a/onnxruntime/python/tools/transformers/models/sam2/benchmark_sam2.sh
+++ b/onnxruntime/python/tools/transformers/models/sam2/benchmark_sam2.sh
@@ -191,7 +191,6 @@ build_onnxruntime_gpu_for_profiling() {
--build_wheel --skip_tests \
--cmake_generator Ninja \
--compile_no_warning_as_error \
- --enable_cuda_nhwc_ops \
--cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=$CUDA_ARCH \
--cmake_extra_defines onnxruntime_ENABLE_NVTX_PROFILE=ON \
--enable_cuda_line_info
diff --git a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc
index 25caa732ef..a3a3dd939c 100644
--- a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc
@@ -1,8 +1,9 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
-
+#include "core/graph/constants.h"
#include "gtest/gtest.h"
#include "test/providers/provider_test_utils.h"
+
using namespace std;
namespace onnxruntime {
namespace test {
@@ -28,7 +29,8 @@ void TestConvOp(const ConvOpAndTestAttributes& attributes,
optional epsilon = optional(),
OpTester::ExpectResult expect_result = OpTester::ExpectResult::kExpectSuccess,
const std::string& err_str = "",
- int opset = 7) {
+ int opset = 7,
+ bool exclude_cuda_nhwc = false) {
OpTester test("Conv", opset);
test.AddAttribute("group", attributes.group);
test.AddAttribute("kernel_shape", attributes.kernel_shape);
@@ -65,6 +67,12 @@ void TestConvOp(const ConvOpAndTestAttributes& attributes,
// Disable TensorRT because weight as input is not supported
excluded_providers.insert(kTensorrtExecutionProvider);
+ if (exclude_cuda_nhwc) {
+#ifdef ENABLE_CUDA_NHWC_OPS
+ excluded_providers.insert(kCudaNHWCExecutionProvider);
+#endif
+ }
+
// QNN SDK 2.10.0 has a bug that breaks support for dynamic bias inputs.
excluded_providers.insert(kQnnExecutionProvider);
@@ -197,10 +205,15 @@ TEST(ConvTest, Conv1D_Bias) {
// as TF32 has a 10 bit mantissa.
float epsilon = 1.1e-5f;
- TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, false, epsilon);
+  // This case is not supported by cuDNN frontend, and the fallback (legacy code) requires weight to be a 4D tensor for NHWC.
+ constexpr bool exclude_cuda_nhwc = true;
+
+ TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, false, epsilon,
+ OpTester::ExpectResult::kExpectSuccess, "", 10, exclude_cuda_nhwc);
// CoreML EP requires weight to be an initializer
- TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true, epsilon);
+ TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true, epsilon,
+ OpTester::ExpectResult::kExpectSuccess, "", 10, exclude_cuda_nhwc);
}
// Conv47
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index 5cc040fd70..24dc6124d4 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -13,6 +13,7 @@ import shlex
import shutil
import subprocess
import sys
+import warnings
from pathlib import Path
@@ -253,7 +254,12 @@ def parse_arguments():
"--cudnn_home is not specified.",
)
parser.add_argument("--enable_cuda_line_info", action="store_true", help="Enable CUDA line info.")
- parser.add_argument("--enable_cuda_nhwc_ops", action="store_true", help="Enable CUDA NHWC ops in build.")
+
+ parser.add_argument(
+        "--enable_cuda_nhwc_ops", action="store_true", help="Deprecated; CUDA NHWC ops are now enabled by default in CUDA builds."
+ )
+
+ parser.add_argument("--disable_cuda_nhwc_ops", action="store_true", help="Disable CUDA NHWC ops in build.")
# Python bindings
parser.add_argument("--enable_pybind", action="store_true", help="Enable Python Bindings.")
@@ -793,6 +799,11 @@ def parse_arguments():
if args.cmake_generator is None and is_windows():
args.cmake_generator = "Ninja" if args.build_wasm else "Visual Studio 17 2022"
+ if args.enable_cuda_nhwc_ops:
+ warnings.warn(
+            "The argument '--enable_cuda_nhwc_ops' is deprecated and now defaults to True; it can be removed.", DeprecationWarning
+ )
+
return args
@@ -1074,7 +1085,7 @@ def generate_build_tree(
"-Donnxruntime_USE_MPI=" + ("ON" if args.use_mpi else "OFF"),
"-Donnxruntime_ENABLE_MEMORY_PROFILE=" + ("ON" if args.enable_memory_profile else "OFF"),
"-Donnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO=" + ("ON" if args.enable_cuda_line_info else "OFF"),
- "-Donnxruntime_USE_CUDA_NHWC_OPS=" + ("ON" if args.enable_cuda_nhwc_ops else "OFF"),
+ "-Donnxruntime_USE_CUDA_NHWC_OPS=" + ("ON" if args.use_cuda and not args.disable_cuda_nhwc_ops else "OFF"),
"-Donnxruntime_BUILD_WEBASSEMBLY_STATIC_LIB=" + ("ON" if args.build_wasm_static_lib else "OFF"),
"-Donnxruntime_ENABLE_WEBASSEMBLY_EXCEPTION_CATCHING="
+ ("OFF" if args.disable_wasm_exception_catching else "ON"),
diff --git a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
index 5af95d3457..0da1f8fc8d 100644
--- a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
@@ -123,7 +123,7 @@ stages:
--parallel \
--build_wheel \
--enable_onnx_tests --use_cuda --cuda_version=11.8 --cuda_home=/usr/local/cuda-11.8 --cudnn_home=/usr/local/cuda-11.8 \
- --enable_cuda_profiling --enable_cuda_nhwc_ops \
+ --enable_cuda_profiling \
--enable_pybind --build_java \
--use_cache \
--cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=75;86' ; \
diff --git a/tools/ci_build/github/linux/build_cuda_ci.sh b/tools/ci_build/github/linux/build_cuda_ci.sh
index a78e240998..0533b7b394 100755
--- a/tools/ci_build/github/linux/build_cuda_ci.sh
+++ b/tools/ci_build/github/linux/build_cuda_ci.sh
@@ -3,28 +3,31 @@ set -ex
#Every cuda container has this $CUDA_VERSION env var set.
SHORT_CUDA_VERSION=$(echo $CUDA_VERSION | sed 's/\([[:digit:]]\+\.[[:digit:]]\+\)\.[[:digit:]]\+/\1/')
-BUILD_ARGS=('--config' 'Release' '--update' '--build'
- '--skip_submodule_sync'
- '--build_shared_lib'
- '--parallel' '--use_binskim_compliant_compile_flags'
- '--build_wheel'
- '--enable_onnx_tests'
- '--use_cuda'
- "--cuda_version=$SHORT_CUDA_VERSION"
- "--cuda_home=/usr/local/cuda-$SHORT_CUDA_VERSION"
- "--cudnn_home=/usr/local/cuda-$SHORT_CUDA_VERSION"
- "--enable_cuda_profiling"
- "--enable_cuda_nhwc_ops"
- "--enable_pybind"
- "--build_java"
- "--cmake_extra_defines"
- "CMAKE_CUDA_ARCHITECTURES=75"
- "onnxruntime_BUILD_UNIT_TESTS=ON"
- "onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON")
+BUILD_ARGS=('--config'
+ 'Release'
+ '--update'
+ '--build'
+ '--skip_submodule_sync'
+ '--build_shared_lib'
+ '--parallel'
+ '--use_binskim_compliant_compile_flags'
+ '--build_wheel'
+ '--enable_onnx_tests'
+ '--use_cuda'
+ "--cuda_version=$SHORT_CUDA_VERSION"
+ "--cuda_home=/usr/local/cuda-$SHORT_CUDA_VERSION"
+ "--cudnn_home=/usr/local/cuda-$SHORT_CUDA_VERSION"
+ "--enable_cuda_profiling"
+ "--enable_pybind"
+ "--build_java"
+ "--cmake_extra_defines"
+ "CMAKE_CUDA_ARCHITECTURES=75"
+ "onnxruntime_BUILD_UNIT_TESTS=ON"
+ "onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON")
if [ -x "$(command -v ninja)" ]; then
BUILD_ARGS+=('--cmake_generator' 'Ninja')
fi
-
+
if [ -d /build ]; then
BUILD_ARGS+=('--build_dir' '/build')
else
@@ -40,7 +43,7 @@ if [ -f /opt/python/cp312-cp312/bin/python3 ]; then
else
python3 tools/ci_build/build.py "${BUILD_ARGS[@]}"
fi
-if [ -x "$(command -v ccache)" ]; then
- ccache -sv
+if [ -x "$(command -v ccache)" ]; then
+ ccache -sv
ccache -z
fi