From 740679d3290b0df594d2e14be959dfbdb405f80b Mon Sep 17 00:00:00 2001 From: Abhishek Jindal Date: Tue, 30 Nov 2021 10:33:13 -0800 Subject: [PATCH 1/8] Abjindal/fix windows ci pipeline (#9883) * switching to /wd4800 for eager mode * fixing compile flags ignore warnings, previously it was only using the last one --- cmake/CMakeLists.txt | 2 +- cmake/onnxruntime_eager.cmake | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index acbde7f56a..0b09078e0e 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -962,7 +962,7 @@ if (WIN32) # issued by thrust nonstandard extension used: nameless struct/union list(APPEND ORT_WARNING_FLAGS "/wd4201") # warning C4800: Implicit conversion from 'X' to bool. Possible information loss - if (onnxruntime_USE_OPENVINO) + if (onnxruntime_USE_OPENVINO OR onnxruntime_ENABLE_EAGER_MODE) list(APPEND ORT_WARNING_FLAGS "/wd4800") else() list(APPEND ORT_WARNING_FLAGS "/w34800") diff --git a/cmake/onnxruntime_eager.cmake b/cmake/onnxruntime_eager.cmake index 2fb2f8a852..88d05a2f14 100644 --- a/cmake/onnxruntime_eager.cmake +++ b/cmake/onnxruntime_eager.cmake @@ -10,14 +10,14 @@ source_group(TREE ${REPO_ROOT} FILES ${onnxruntime_eager_srcs}) add_library(onnxruntime_eager ${onnxruntime_eager_srcs}) if(MSVC AND onnxruntime_ENABLE_EAGER_MODE) - set_source_files_properties("${ORTTRAINING_ROOT}/orttraining/eager/ort_aten.cpp" PROPERTIES COMPILE_FLAGS "/wd4100" COMPILE_FLAGS "/wd4458") + set_source_files_properties("${ORTTRAINING_ROOT}/orttraining/eager/ort_aten.cpp" PROPERTIES COMPILE_FLAGS "/wd4100 /wd4458") set_source_files_properties("${ORTTRAINING_ROOT}/orttraining/eager/ort_customops.g.cpp" PROPERTIES COMPILE_FLAGS "/wd4100") set_source_files_properties("${ORTTRAINING_ROOT}/orttraining/eager/ort_backends.cpp" PROPERTIES COMPILE_FLAGS "/wd4100") set_source_files_properties("${ORTTRAINING_ROOT}/orttraining/eager/ort_hooks.cpp" PROPERTIES COMPILE_FLAGS "/wd4100") set_source_files_properties("${ORTTRAINING_ROOT}/orttraining/eager/ort_eager.cpp" PROPERTIES COMPILE_FLAGS "/wd4100") - set_source_files_properties("${ORTTRAINING_ROOT}/orttraining/eager/ort_log.cpp" PROPERTIES COMPILE_FLAGS "/wd4100" COMPILE_FLAGS "/wd4324") + set_source_files_properties("${ORTTRAINING_ROOT}/orttraining/eager/ort_log.cpp" PROPERTIES COMPILE_FLAGS "/wd4100 /wd4324") set_source_files_properties("${ORTTRAINING_ROOT}/orttraining/eager/ort_guard.cpp" PROPERTIES COMPILE_FLAGS "/wd4100") - set_source_files_properties("${ORTTRAINING_ROOT}/orttraining/eager/ort_tensor.cpp" PROPERTIES COMPILE_FLAGS "/wd4100" COMPILE_FLAGS "/wd4458" COMPILE_FLAGS "/wd4127") + set_source_files_properties("${ORTTRAINING_ROOT}/orttraining/eager/ort_tensor.cpp" PROPERTIES COMPILE_FLAGS "/wd4100 /wd4458 /wd4127") set_source_files_properties("${ORTTRAINING_ROOT}/orttraining/eager/ort_ops.cpp" PROPERTIES COMPILE_FLAGS "/wd4100") set_source_files_properties("${ORTTRAINING_ROOT}/orttraining/eager/ort_util.cpp" PROPERTIES COMPILE_FLAGS "/wd4100") endif() From 6de79d82c85f1acff58d0ba31ef48787b0e02152 Mon Sep 17 00:00:00 2001 From: Sherlock Date: Tue, 30 Nov 2021 15:26:10 -0800 Subject: [PATCH 2/8] Fix Training Packaging pipeline (#9885) * Fix Training Packaging pipeline --- docs/python/inference/examples/plot_common_errors.py | 2 +- .../inference/examples/plot_convert_pipeline_vectorizer.py | 2 +- docs/python/inference/examples/plot_load_and_predict.py | 2 +- docs/python/inference/examples/plot_metadata.py | 4 ++-- docs/python/inference/examples/plot_profiling.py | 4 ++-- .../python/inference/examples/plot_train_convert_predict.py | 6 +++--- docs/python/inference/tutorial.rst | 4 ++-- onnxruntime/test/python/onnxruntime_test_python_keras.py | 2 +- orttraining/tools/scripts/layer_norm_transform.py | 4 ++-- orttraining/tools/scripts/model_transform.py | 4 ++-- orttraining/tools/scripts/nv_run_pretraining.py | 2 +- 11 files changed, 18 insertions(+), 18 deletions(-) diff --git a/docs/python/inference/examples/plot_common_errors.py b/docs/python/inference/examples/plot_common_errors.py index 0d98e17c45..b474574c0f 100644 --- a/docs/python/inference/examples/plot_common_errors.py +++ b/docs/python/inference/examples/plot_common_errors.py @@ -21,7 +21,7 @@ import numpy from onnxruntime.datasets import get_example example2 = get_example("logreg_iris.onnx") -sess = rt.InferenceSession(example2) +sess = rt.InferenceSession(example2, providers=rt.get_available_providers()) input_name = sess.get_inputs()[0].name output_name = sess.get_outputs()[0].name diff --git a/docs/python/inference/examples/plot_convert_pipeline_vectorizer.py b/docs/python/inference/examples/plot_convert_pipeline_vectorizer.py index 0de0b30e28..af1351d0c8 100644 --- a/docs/python/inference/examples/plot_convert_pipeline_vectorizer.py +++ b/docs/python/inference/examples/plot_convert_pipeline_vectorizer.py @@ -72,7 +72,7 @@ with open("pipeline_vectorize.onnx", "wb") as f: import onnxruntime as rt from onnxruntime.capi.onnxruntime_pybind11_state import InvalidArgument -sess = rt.InferenceSession("pipeline_vectorize.onnx") +sess = rt.InferenceSession("pipeline_vectorize.onnx", providers=rt.get_available_providers()) import numpy inp, out = sess.get_inputs()[0], sess.get_outputs()[0] diff --git a/docs/python/inference/examples/plot_load_and_predict.py b/docs/python/inference/examples/plot_load_and_predict.py index feb369feb2..9bfdc57957 100644 --- a/docs/python/inference/examples/plot_load_and_predict.py +++ b/docs/python/inference/examples/plot_load_and_predict.py @@ -21,7 +21,7 @@ from onnxruntime.datasets import get_example # The model is available on github `onnx...test_sigmoid `_. example1 = get_example("sigmoid.onnx") -sess = rt.InferenceSession(example1) +sess = rt.InferenceSession(example1, providers=rt.get_available_providers()) ######################### # Let's see the input name and shape. diff --git a/docs/python/inference/examples/plot_metadata.py b/docs/python/inference/examples/plot_metadata.py index df5d15276c..94c45e688f 100644 --- a/docs/python/inference/examples/plot_metadata.py +++ b/docs/python/inference/examples/plot_metadata.py @@ -31,8 +31,8 @@ print("producer_version={}".format(model.producer_version)) ############################# # With *ONNX Runtime*: -from onnxruntime import InferenceSession -sess = InferenceSession(example) +import onnxruntime as rt +sess = rt.InferenceSession(example, providers=rt.get_available_providers()) meta = sess.get_modelmeta() print("custom_metadata_map={}".format(meta.custom_metadata_map)) diff --git a/docs/python/inference/examples/plot_profiling.py b/docs/python/inference/examples/plot_profiling.py index f0ea727ede..402e7b3bae 100644 --- a/docs/python/inference/examples/plot_profiling.py +++ b/docs/python/inference/examples/plot_profiling.py @@ -35,7 +35,7 @@ def change_ir_version(filename, ir_version=6): example1 = get_example("mul_1.onnx") onnx_model = change_ir_version(example1) onnx_model_str = onnx_model.SerializeToString() -sess = rt.InferenceSession(onnx_model_str) +sess = rt.InferenceSession(onnx_model_str, providers=rt.get_available_providers()) input_name = sess.get_inputs()[0].name x = numpy.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=numpy.float32) @@ -48,7 +48,7 @@ print(res) options = rt.SessionOptions() options.enable_profiling = True -sess_profile = rt.InferenceSession(onnx_model_str, options) +sess_profile = rt.InferenceSession(onnx_model_str, options, providers=rt.get_available_providers()) input_name = sess.get_inputs()[0].name x = numpy.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=numpy.float32) diff --git a/docs/python/inference/examples/plot_train_convert_predict.py b/docs/python/inference/examples/plot_train_convert_predict.py index 5b060c5f41..4aa36b3dce 100644 --- a/docs/python/inference/examples/plot_train_convert_predict.py +++ b/docs/python/inference/examples/plot_train_convert_predict.py @@ -64,7 +64,7 @@ with open("logreg_iris.onnx", "wb") as f: # its input and output. import onnxruntime as rt -sess = rt.InferenceSession("logreg_iris.onnx") +sess = rt.InferenceSession("logreg_iris.onnx", providers=rt.get_available_providers()) print("input name='{}' and shape={}".format( sess.get_inputs()[0].name, sess.get_inputs()[0].shape)) @@ -180,7 +180,7 @@ with open("rf_iris.onnx", "wb") as f: ################################### # We compare. -sess = rt.InferenceSession("rf_iris.onnx") +sess = rt.InferenceSession("rf_iris.onnx", providers=rt.get_available_providers()) def sess_predict_proba_rf(x): return sess.run([prob_name], {input_name: x.astype(numpy.float32)})[0] @@ -204,7 +204,7 @@ for n_trees in range(5, 51, 5): onx = convert_sklearn(rf, initial_types=initial_type) with open("rf_iris_%d.onnx" % n_trees, "wb") as f: f.write(onx.SerializeToString()) - sess = rt.InferenceSession("rf_iris_%d.onnx" % n_trees) + sess = rt.InferenceSession("rf_iris_%d.onnx" % n_trees, providers=rt.get_available_providers()) def sess_predict_proba_loop(x): return sess.run([prob_name], {input_name: x.astype(numpy.float32)})[0] tsk = speed("loop(X_test, rf.predict_proba, 100)", number=5, repeat=5) diff --git a/docs/python/inference/tutorial.rst b/docs/python/inference/tutorial.rst index d00a378cfe..fccca9cbd1 100644 --- a/docs/python/inference/tutorial.rst +++ b/docs/python/inference/tutorial.rst @@ -82,7 +82,7 @@ for this machine learning model. import numpy import onnxruntime as rt - sess = rt.InferenceSession("logreg_iris.onnx") + sess = rt.InferenceSession("logreg_iris.onnx", providers=rt.get_available_providers()) input_name = sess.get_inputs()[0].name pred_onx = sess.run(None, {input_name: X_test.astype(numpy.float32)})[0] print(pred_onx) @@ -97,7 +97,7 @@ by specifying its name into a list. import numpy import onnxruntime as rt - sess = rt.InferenceSession("logreg_iris.onnx") + sess = rt.InferenceSession("logreg_iris.onnx", providers=rt.get_available_providers()) input_name = sess.get_inputs()[0].name label_name = sess.get_outputs()[0].name pred_onx = sess.run([label_name], {input_name: X_test.astype(numpy.float32)})[0] diff --git a/onnxruntime/test/python/onnxruntime_test_python_keras.py b/onnxruntime/test/python/onnxruntime_test_python_keras.py index e2c4f2390d..02e7cdb8e7 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_keras.py +++ b/onnxruntime/test/python/onnxruntime_test_python_keras.py @@ -68,7 +68,7 @@ class TestInferenceSessionKeras(unittest.TestCase): # runtime content = converted_model.SerializeToString() - rt = onnxrt.InferenceSession(content) + rt = onnxrt.InferenceSession(content, providers=onnxrt.get_available_providers()) input = {rt.get_inputs()[0].name: x} actual_rt = rt.run(None, input) self.assertEqual(len(actual_rt), 1) diff --git a/orttraining/tools/scripts/layer_norm_transform.py b/orttraining/tools/scripts/layer_norm_transform.py index 15b2b4ae07..6355118709 100644 --- a/orttraining/tools/scripts/layer_norm_transform.py +++ b/orttraining/tools/scripts/layer_norm_transform.py @@ -163,11 +163,11 @@ def main(): input_mask = np.ones((batch, sq_length), dtype=np.int64) # Do forward using the original model. - sess = ort.InferenceSession(model_file_path) + sess = ort.InferenceSession(model_file_path, providers=ort.get_available_providers()) result = sess.run(None, {'input1': input_ids, 'input2': segment_ids, 'input3': input_mask}) # Do forward using the new model. - new_sess = ort.InferenceSession(new_model_file_path) + new_sess = ort.InferenceSession(new_model_file_path, providers=ort.get_available_providers()) new_result = new_sess.run(None, {'input1': input_ids, 'input2': segment_ids, 'input3': input_mask}) # Compare the outcomes from the two models. diff --git a/orttraining/tools/scripts/model_transform.py b/orttraining/tools/scripts/model_transform.py index 26424db66d..de23df13a1 100644 --- a/orttraining/tools/scripts/model_transform.py +++ b/orttraining/tools/scripts/model_transform.py @@ -298,11 +298,11 @@ segment_ids = np.random.randint(low=0, high=2, size=(batch, sq_length), dtype=np input_mask = np.ones((batch, sq_length), dtype=np.int64) # Do forward using the original model. -sess = ort.InferenceSession(input_model_name) +sess = ort.InferenceSession(input_model_name, providers=ort.get_available_providers()) result = sess.run(None, {'input1': input_ids, 'input2': segment_ids, 'input3': input_mask}) # Do forward using the new model. -new_sess = ort.InferenceSession(output_model_name) +new_sess = ort.InferenceSession(output_model_name, providers=ort.get_available_providers()) new_result = new_sess.run(None, {'input1': input_ids, 'input2': segment_ids, 'input3': input_mask}) # Compare the outcomes from the two models. diff --git a/orttraining/tools/scripts/nv_run_pretraining.py b/orttraining/tools/scripts/nv_run_pretraining.py index 1b3ec4a247..c7c03be161 100644 --- a/orttraining/tools/scripts/nv_run_pretraining.py +++ b/orttraining/tools/scripts/nv_run_pretraining.py @@ -528,7 +528,7 @@ def main(): is_model_exported = False import onnxruntime as ort - sess = ort.InferenceSession(onnx_path) + sess = ort.InferenceSession(onnx_path, providers=ort.get_available_providers()) result = sess.run(None, {'input1': input_ids.cpu().numpy(), 'input2': segment_ids.cpu().numpy(), 'input3': input_mask.cpu().numpy()}) print('---ORT result---') From c161813217d9bb17c52611e284f6956a7b30de37 Mon Sep 17 00:00:00 2001 From: Hariharan Seshadri Date: Tue, 30 Nov 2021 19:15:45 -0800 Subject: [PATCH 3/8] Misc InstanceNorm CUDA kernel changes (#9879) --- .../core/providers/cuda/nn/instance_norm.cc | 154 +++++++++++++++++- .../providers/cuda/nn/instance_norm_impl.cu | 43 ++--- .../providers/cuda/nn/instance_norm_impl.h | 14 +- .../providers/cpu/nn/instance_norm_op_test.cc | 149 ++++++++++++++++- 4 files changed, 321 insertions(+), 39 deletions(-) diff --git a/onnxruntime/core/providers/cuda/nn/instance_norm.cc b/onnxruntime/core/providers/cuda/nn/instance_norm.cc index c40c27cdf1..da51ba8f90 100644 --- a/onnxruntime/core/providers/cuda/nn/instance_norm.cc +++ b/onnxruntime/core/providers/cuda/nn/instance_norm.cc @@ -5,6 +5,7 @@ #include "instance_norm_impl.h" #include "core/providers/cpu/nn/instance_norm_helper.h" #include "core/providers/cpu/nn/batch_norm_helper.h" +#include "core/providers/cuda/math/unary_elementwise_ops_impl.h" namespace onnxruntime { namespace cuda { @@ -45,10 +46,10 @@ Status InstanceNorm::ComputeInternal(OpKernelContext* p_op_kernel_context) co const TensorShape& x_shape = X->Shape(); Tensor* Y = p_op_kernel_context->Output(0, x_shape); - auto y_data = reinterpret_cast(Y->template MutableData()); - auto x_data = reinterpret_cast(X->template Data()); - auto scale_data = reinterpret_cast(scale->template Data()); - auto bias_data = reinterpret_cast(bias->template Data()); + auto* y_data = reinterpret_cast(Y->template MutableData()); + const auto* x_data = reinterpret_cast(X->template Data()); + const auto* scale_data = reinterpret_cast(scale->template Data()); + const auto* bias_data = reinterpret_cast(bias->template Data()); const auto& x_dims = x_shape.GetDims(); const int64_t N = x_dims[0]; @@ -160,5 +161,150 @@ Status InstanceNorm::ComputeInternal(OpKernelContext* p_op_kernel_context) co return Status::OK(); } +template <> +Status InstanceNorm::ComputeInternal(OpKernelContext* p_op_kernel_context) const { + typedef typename ToCudaType::MappedType CudaT; + + const Tensor* X = p_op_kernel_context->Input(0); + const Tensor* scale = p_op_kernel_context->Input(1); + const Tensor* bias = p_op_kernel_context->Input(2); + + ORT_RETURN_IF_ERROR(InstanceNormHelper::ValidateInputs(X, scale, bias)); + + const TensorShape& x_shape = X->Shape(); + Tensor* Y = p_op_kernel_context->Output(0, x_shape); + + auto* y_data = reinterpret_cast(Y->template MutableData()); + const auto* x_data = reinterpret_cast(X->template Data()); + const auto* scale_data = reinterpret_cast(scale->template Data()); + const auto* bias_data = reinterpret_cast(bias->template Data()); + + const auto& x_dims = x_shape.GetDims(); + const int64_t N = x_dims[0]; + const int64_t C = x_dims[1]; + const auto one = Consts::One; + const auto zero = Consts::Zero; + + if (N == 1) { + // when N == 1, we can treat it as spatial batch normalization in training + // as the mean/variance would be computed from input + + CudnnTensor data_desc; + std::vector new_dims; + BatchNormHelper::NormalizeDims(x_shape, new_dims); + ORT_RETURN_IF_ERROR(data_desc.Set(new_dims, CudnnTensor::GetDataType())); + + CudnnTensor stats_desc; + ORT_RETURN_IF_ERROR(stats_desc.Set(data_desc, CUDNN_BATCHNORM_SPATIAL)); + + // For half input data type, alpha, beta, scale, bias need to be float type. + // alpha, beta will be of type float as the Consts struct specialization + // for MLFloat16 type take care of that. Only Convert the scale, bias to float) + + auto scale_data_fp32 = GetScratchBuffer(C); + Impl_Cast(Stream(), scale_data, scale_data_fp32.get(), C); + + auto bias_data_fp32 = GetScratchBuffer(C); + Impl_Cast(Stream(), bias_data, bias_data_fp32.get(), C); + + CUDNN_RETURN_IF_ERROR(cudnnBatchNormalizationForwardTraining( + CudnnHandle(), + CUDNN_BATCHNORM_SPATIAL, + &one, + &zero, + data_desc, + x_data, + data_desc, + y_data, + stats_desc, + scale_data_fp32.get(), + bias_data_fp32.get(), + 1.0f, + nullptr, + nullptr, + epsilon_, + nullptr, + nullptr)); + } else { + // we use cudnnBatchNormalizationForwardTraining to compute mean/variance + // so collapsing NC into channel + + auto input_count = x_shape.Size(); // N * C * H * W + auto stats_count = x_shape.SizeToDimension(2); // N * C + auto image_size = input_count / stats_count; + + CudnnTensor data_desc; + ORT_RETURN_IF_ERROR(data_desc.Set(std::array{1, stats_count, image_size, 1}, + CudnnTensor::GetDataType())); + + // stats_desc needs to be of 'float' type even for float16 input as the "stats" are of float type + CudnnTensor stats_desc; + ORT_RETURN_IF_ERROR(stats_desc.Set(std::array{1, stats_count, 1, 1}, + CudnnTensor::GetDataType())); + + // For half input data type, we need to allocate some "intermediate" + // float buffers for CuDNN to use. + const size_t stats_byte_count = stats_count * sizeof(float); + + // Mean & Variance are inputs & outputs and must be initialized to zero to work properly + auto mean = GetScratchBuffer(stats_count); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(mean.get(), 0, stats_byte_count, Stream())); + auto variance = GetScratchBuffer(stats_count); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(variance.get(), 0, stats_byte_count, Stream())); + + // We must set the scale & bias inputs to zero as they are inputs to the calculation + auto unused_scale = GetScratchBuffer(stats_count); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(unused_scale.get(), 0, stats_byte_count, Stream())); + auto unused_bias = GetScratchBuffer(stats_count); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(unused_bias.get(), 0, stats_byte_count, Stream())); + + // first, compute mean and variance per-instance per-channel using cudnnBatchNorm training + CUDNN_RETURN_IF_ERROR(cudnnBatchNormalizationForwardTraining( + CudnnHandle(), + CUDNN_BATCHNORM_SPATIAL, + &one, + &zero, + data_desc, + x_data, + data_desc, + y_data, // use y temporarily, would be rewritten later + stats_desc, + unused_scale.get(), + unused_bias.get(), + 1.0f, + mean.get(), + variance.get(), + CUDNN_BN_MIN_EPSILON, + nullptr, + nullptr)); + + // Y = scale * (x - mean) / sqrt (variance + epsilon) + B + // X/Y is (N,C,H,W) + // scale/bias is (1,C,1,1) + // mean/stddev is (N,C,1,1) + // NOTE cudnnBatchNormalization computes unbiased variance sum((Xi - mean)^2) / (count - 1) + // and it needs to be corrected with (count - 1) / count + fast_divmod fdm_HW(gsl::narrow_cast(image_size)); + fast_divmod fdm_C(gsl::narrow_cast(C)); + + // The InstanceNormImpl kernel handles the mean/variance in float32, so no casting required here + InstanceNormImpl( + Stream(), + x_data, + scale_data, + bias_data, + mean.get(), + variance.get(), + (image_size - 1.0) / image_size, + static_cast(epsilon_), + fdm_HW, + fdm_C, + y_data, + input_count); + } + + return Status::OK(); +} + } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/nn/instance_norm_impl.cu b/onnxruntime/core/providers/cuda/nn/instance_norm_impl.cu index c0af3d0580..057c301dbd 100644 --- a/onnxruntime/core/providers/cuda/nn/instance_norm_impl.cu +++ b/onnxruntime/core/providers/cuda/nn/instance_norm_impl.cu @@ -7,18 +7,18 @@ namespace onnxruntime { namespace cuda { -template +template __global__ void _InstanceNormKernel( - const T* input_data, - const T* scale, - const T* bias, - const T* mean, - const T* variance, + const T1* __restrict__ input_data, + const T1* __restrict__ scale, + const T1* __restrict__ bias, + const T2* __restrict__ mean, + const T2* __restrict__ variance, const double variance_correction, const double epsilon, const fast_divmod fdm_HW, const fast_divmod fdm_C, - T* output_data, + T1* __restrict__ output_data, const CUDA_LONG N) { CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); int nc = fdm_HW.div(id); @@ -26,34 +26,35 @@ __global__ void _InstanceNormKernel( fdm_C.divmod(nc, n, c); // Y = scale * (x - mean) / sqrt (std * std + epsilon) + B - output_data[id] = scale[c] * (input_data[id] - mean[nc]) / _Sqrt(variance[nc] * (T)variance_correction + (T)epsilon) + bias[c]; + output_data[id] = scale[c] * (input_data[id] - (T1)mean[nc]) / _Sqrt((T1)variance[nc] * (T1)variance_correction + (T1)epsilon) + bias[c]; } -template +template void InstanceNormImpl( cudaStream_t stream, - const T* input_data, - const T* scale, - const T* bias, - const T* mean, - const T* variance, + const T1* input_data, + const T1* scale, + const T1* bias, + const T2* mean, + const T2* variance, const double variance_correction, const double epsilon, const fast_divmod& fdm_HW, const fast_divmod& fdm_C, - T* output_data, + T1* output_data, size_t N) { int blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); - _InstanceNormKernel<<>>( + _InstanceNormKernel<<>>( input_data, scale, bias, mean, variance, variance_correction, epsilon, fdm_HW, fdm_C, output_data, (CUDA_LONG)N); } -#define SPECIALIZED_IMPL(T) \ - template void InstanceNormImpl(cudaStream_t stream, const T* input_data, const T* scale, const T* bias, const T* mean, const T* stddev, const double variance_correction, const double epsilon, const fast_divmod& fdm_HW, const fast_divmod& fdm_C, T* output_data, size_t count); +#define SPECIALIZED_IMPL(T1, T2) \ + template void InstanceNormImpl(cudaStream_t stream, const T1* input_data, const T1* scale, const T1* bias, const T2* mean, const T2* stddev, const double variance_correction, const double epsilon, const fast_divmod& fdm_HW, const fast_divmod& fdm_C, T1* output_data, size_t count); -SPECIALIZED_IMPL(float) -SPECIALIZED_IMPL(double) -SPECIALIZED_IMPL(half) +SPECIALIZED_IMPL(float, float) +SPECIALIZED_IMPL(double, double) +// When the input data type is float16, the means and variances will flow in as float32 (special case) +SPECIALIZED_IMPL(half, float) } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/nn/instance_norm_impl.h b/onnxruntime/core/providers/cuda/nn/instance_norm_impl.h index cda9684416..35d754b297 100644 --- a/onnxruntime/core/providers/cuda/nn/instance_norm_impl.h +++ b/onnxruntime/core/providers/cuda/nn/instance_norm_impl.h @@ -6,19 +6,19 @@ namespace onnxruntime { namespace cuda { -template +template void InstanceNormImpl( cudaStream_t stream, - const T* input_data, - const T* scale, - const T* bias, - const T* mean, - const T* variance, + const T1* input_data, + const T1* scale, + const T1* bias, + const T2* mean, + const T2* variance, const double variance_correction, const double epsilon, const fast_divmod& fdm_HW, const fast_divmod& fdm_C, - T* output_data, + T1* output_data, size_t count); } // namespace cuda diff --git a/onnxruntime/test/providers/cpu/nn/instance_norm_op_test.cc b/onnxruntime/test/providers/cpu/nn/instance_norm_op_test.cc index 17afa96bd8..45c8ed74f6 100644 --- a/onnxruntime/test/providers/cpu/nn/instance_norm_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/instance_norm_op_test.cc @@ -41,11 +41,11 @@ TEST(InstanceNormalizationOpTest, InstanceNorm) { -0.14644464F, -0.82262872F, -0.66852817F, 1.63760153F, -1.65898662F, 0.27618144F, 0.64840618F, 0.734399F}; test.AddOutput("Y", input_dims, expected_output); -#if defined(OPENVINO_CONFIG_MYRIAD) //Disabling this test on MYRIADX temporarily due to a bug +#if defined(OPENVINO_CONFIG_MYRIAD) //Disabling this test on MYRIADX temporarily due to a bug test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider}); #else test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); -#endif +#endif } TEST(InstanceNormalizationOpTest, InstanceNormBatch1) { @@ -58,12 +58,10 @@ TEST(InstanceNormalizationOpTest, InstanceNormBatch1) { vector input_dims = {1, 3, 4}; test.AddInput("input", input_dims, input); - // vector scale = {2.1F, 0.1F, 1.F}; vector scale = {1.0F, 1.0F, 1.F}; vector scale_dims = {3}; test.AddInput("scale", scale_dims, scale); - // vector B = {2.3F, 1.5F, 0.F}; vector B = {0.0F, 0.0F, 0.F}; vector B_dims = {3}; test.AddInput("B", B_dims, B); @@ -72,13 +70,150 @@ TEST(InstanceNormalizationOpTest, InstanceNormBatch1) { 1.46688162F, -0.98600774F, -0.79911913F, 0.31824524F, 0.57370438F, 0.42193634F, 0.6525492F, -1.64818992F}; test.AddOutput("Y", input_dims, expected_output); -#if defined(OPENVINO_CONFIG_MYRIAD) //Disabling this test on MYRIADX temporarily due to a bug + +#if defined(OPENVINO_CONFIG_MYRIAD) //Disabling this test on MYRIADX temporarily due to a bug test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider}); #else test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); -#endif +#endif } +TEST(InstanceNormalizationOpTest, InstanceNormBatch2) { + OpTester test("InstanceNormalization"); + test.AddAttribute("epsilon", 0.3F); + + vector input = {3.1513367F, 9.283596F, 1.4546119F, 5.4617004F, + 8.519701F, 1.2382338F, 1.7930176F, 5.1099434F, + 7.9195533F, 7.638727F, 8.065445F, 3.8082376F, + + 3.1513367F, 9.283596F, 1.4546119F, 5.4617004F, + 8.519701F, 1.2382338F, 1.7930176F, 5.1099434F, + 7.9195533F, 7.638727F, 8.065445F, 3.8082376F}; + vector input_dims = {2, 3, 4}; + test.AddInput("input", input_dims, input); + + vector scale = {1.0F, 1.0F, 1.F}; + vector scale_dims = {3}; + test.AddInput("scale", scale_dims, scale); + + vector B = {0.0F, 0.0F, 0.F}; + vector B_dims = {3}; + test.AddInput("B", B_dims, B); + + vector expected_output = {-0.56495477F, 1.48930046F, -1.13334329F, 0.20899761F, + 1.46688162F, -0.98600774F, -0.79911913F, 0.31824524F, + 0.57370438F, 0.42193634F, 0.6525492F, -1.64818992F, + + -0.56495477F, 1.48930046F, -1.13334329F, 0.20899761F, + 1.46688162F, -0.98600774F, -0.79911913F, 0.31824524F, + 0.57370438F, 0.42193634F, 0.6525492F, -1.64818992F}; + + test.AddOutput("Y", input_dims, expected_output); +#if defined(OPENVINO_CONFIG_MYRIAD) //Disabling this test on MYRIADX temporarily due to a bug + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider}); +#else + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +#endif +} + +// Only CUDA kernel has float 16 support +#ifdef USE_CUDA + +TEST(InstanceNormalizationOpTest, InstanceNormBatch1_fp16) { + OpTester test("InstanceNormalization"); + test.AddAttribute("epsilon", 0.3F); + + vector input = {3.1513367F, 9.283596F, 1.4546119F, 5.4617004F, + 8.519701F, 1.2382338F, 1.7930176F, 5.1099434F, + 7.9195533F, 7.638727F, 8.065445F, 3.8082376F}; + vector input_dims = {1, 3, 4}; + + vector scale = {1.0F, 1.0F, 1.F}; + vector scale_dims = {3}; + + vector B = {0.0F, 0.0F, 0.F}; + vector B_dims = {3}; + + vector expected_output = {-0.56495477F, 1.48930046F, -1.13334329F, 0.20899761F, + 1.46688162F, -0.98600774F, -0.79911913F, 0.31824524F, + 0.57370438F, 0.42193634F, 0.6525492F, -1.64818992F}; + + constexpr size_t input_size = 1 * 3 * 4; + + vector input_fp16(input_size); + vector scale_fp16(3); + vector B_fp16(3); + vector expected_output_fp16(input_size); + + ConvertFloatToMLFloat16(input.data(), input_fp16.data(), input_size); + ConvertFloatToMLFloat16(scale.data(), scale_fp16.data(), 3); + ConvertFloatToMLFloat16(B.data(), B_fp16.data(), 3); + ConvertFloatToMLFloat16(expected_output.data(), expected_output_fp16.data(), input_size); + + test.AddInput("X", input_dims, input_fp16); + test.AddInput("scale", {3}, scale_fp16); + test.AddInput("B", {3}, B_fp16); + test.AddOutput("Y", input_dims, expected_output_fp16); + +#if defined(OPENVINO_CONFIG_MYRIAD) //Disabling this test on MYRIADX temporarily due to a bug + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider}); +#else + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +#endif +} + +TEST(InstanceNormalizationOpTest, InstanceNormBatch2_fp16) { + OpTester test("InstanceNormalization"); + test.AddAttribute("epsilon", 0.3F); + + vector input = {3.1513367F, 9.283596F, 1.4546119F, 5.4617004F, + 8.519701F, 1.2382338F, 1.7930176F, 5.1099434F, + 7.9195533F, 7.638727F, 8.065445F, 3.8082376F, + + 3.1513367F, 9.283596F, 1.4546119F, 5.4617004F, + 8.519701F, 1.2382338F, 1.7930176F, 5.1099434F, + 7.9195533F, 7.638727F, 8.065445F, 3.8082376F}; + vector input_dims = {2, 3, 4}; + + vector scale = {1.0F, 1.0F, 1.F}; + vector scale_dims = {3}; + + vector B = {0.0F, 0.0F, 0.F}; + vector B_dims = {3}; + + vector expected_output = {-0.56495477F, 1.48930046F, -1.13334329F, 0.20899761F, + 1.46688162F, -0.98600774F, -0.79911913F, 0.31824524F, + 0.57370438F, 0.42193634F, 0.6525492F, -1.64818992F, + + -0.56495477F, 1.48930046F, -1.13334329F, 0.20899761F, + 1.46688162F, -0.98600774F, -0.79911913F, 0.31824524F, + 0.57370438F, 0.42193634F, 0.6525492F, -1.64818992F}; + + constexpr size_t input_size = 2 * 3 * 4; + + vector input_fp16(input_size); + vector scale_fp16(3); + vector B_fp16(3); + vector expected_output_fp16(input_size); + + ConvertFloatToMLFloat16(input.data(), input_fp16.data(), input_size); + ConvertFloatToMLFloat16(scale.data(), scale_fp16.data(), 3); + ConvertFloatToMLFloat16(B.data(), B_fp16.data(), 3); + ConvertFloatToMLFloat16(expected_output.data(), expected_output_fp16.data(), input_size); + + test.AddInput("X", input_dims, input_fp16); + test.AddInput("scale", {3}, scale_fp16); + test.AddInput("B", {3}, B_fp16); + test.AddOutput("Y", input_dims, expected_output_fp16); + +#if defined(OPENVINO_CONFIG_MYRIAD) //Disabling this test on MYRIADX temporarily due to a bug + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider}); +#else + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +#endif +} + +#endif TEST(InstanceNormalizationOpTest, InstanceNorm_2) { OpTester test("InstanceNormalization"); test.AddAttribute("epsilon", 0.3F); @@ -119,7 +254,7 @@ TEST(InstanceNormalizationOpTest, InstanceNorm_2) { 1.88028F, 2.353724F, -0.25549555F, 2.0837004F, 2.8466992F, 2.0773761F}; test.AddOutput("Y", input_dims, expected_output); -#if defined(OPENVINO_CONFIG_MYRIAD) //Disabling this test on MYRIADX temporarily due to a bug +#if defined(OPENVINO_CONFIG_MYRIAD) //Disabling this test on MYRIADX temporarily due to a bug test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider}); #else test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); From 175acf08f470db0bb2e4b8eefe55cdeb87c8b132 Mon Sep 17 00:00:00 2001 From: Sherlock Date: Tue, 30 Nov 2021 21:17:32 -0800 Subject: [PATCH 4/8] ScatterND supports negative indices (#9739) * ScatterND supports negative indices --- .../core/providers/cpu/tensor/scatter_nd.cc | 17 +++++++--- .../providers/cuda/tensor/scatter_nd_impl.cu | 16 +++++++--- .../cpu/tensor/scatter_nd_op_test.cc | 31 +++++++++++++++++-- .../python/orttraining_test_ortmodule_api.py | 28 +++++++++++++++++ 4 files changed, 81 insertions(+), 11 deletions(-) diff --git a/onnxruntime/core/providers/cpu/tensor/scatter_nd.cc b/onnxruntime/core/providers/cpu/tensor/scatter_nd.cc index 2b3ee69561..b5a19fd44c 100644 --- a/onnxruntime/core/providers/cpu/tensor/scatter_nd.cc +++ b/onnxruntime/core/providers/cpu/tensor/scatter_nd.cc @@ -131,7 +131,6 @@ Status ScatterNDBase::PrepareForCompute(OpKernelContext* context, Prepare& p) co element_counts[i] = input_strides[i]; } - int64_t err_indice = 0; p.element_bytes = input_tensor->DataType()->Size(); p.element_to_copy = input_shape.SizeFromDimension(last_indice_dimension); p.bytes_to_copy = p.element_bytes * p.element_to_copy; @@ -150,13 +149,23 @@ Status ScatterNDBase::PrepareForCompute(OpKernelContext* context, Prepare& p) co for (int64_t i = 0; i < offset_count; ++i) { for (int64_t j = 0; j < last_indice_dimension; ++j) { auto indice = *(indice_offset + i * last_indice_dimension + j); - if (indice < 0 || indice >= input_shape[j]) { - err_indice = indice; + + if (indice >= 0) { + if (indice >= input_shape[j]) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "invalid indice found, indice = ", indice); + } + } else { + if (indice < -input_shape[j]) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "invalid indice found, indice = ", indice); + } else { + indice += input_shape[j]; + } } + p.element_offsets[i] += indice * element_counts[j]; } } - return err_indice == 0 ? Status::OK() : ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "invalid indice found, indice = ", err_indice); + return Status::OK(); } Status ScatterND::Compute(OpKernelContext* context) const { diff --git a/onnxruntime/core/providers/cuda/tensor/scatter_nd_impl.cu b/onnxruntime/core/providers/cuda/tensor/scatter_nd_impl.cu index 0651049a5f..e9199b5e1b 100644 --- a/onnxruntime/core/providers/cuda/tensor/scatter_nd_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/scatter_nd_impl.cu @@ -34,11 +34,19 @@ __global__ void _ScatterNDKernel( // This would have been an error in the CPU kernel, but throwing in the CUDA EP // is hard. This is the approach taken by other frameworks for out of bound indices // in their corresponding GPU backends as well. - if (index < 0) - index = 0; + // index >= -dim_value && index < dim_value - else if (index >= dim_value) - index = dim_value - 1; + if (index >= 0) { + if (index >= dim_value) { + index = dim_value - 1; + } + } else { + if (index < -dim_value) { + index = 0; + } else { + index += dim_value; + } + } data_offset += (index * element_count_dim); } diff --git a/onnxruntime/test/providers/cpu/tensor/scatter_nd_op_test.cc b/onnxruntime/test/providers/cpu/tensor/scatter_nd_op_test.cc index 145fbacdb3..a3d943ea78 100644 --- a/onnxruntime/test/providers/cpu/tensor/scatter_nd_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/scatter_nd_op_test.cc @@ -39,6 +39,15 @@ TEST(ScatterNDOpTest, ScatterND_matrice_int64_int64) { test.Run(); } +TEST(ScatterNDOpTest, ScatterND_matrice_int64_int64_neg_indices) { + OpTester test("ScatterND", 11); + test.AddInput ("data", {2,2}, {1LL,1LL,2LL,2LL}); + test.AddInput ("indices", {2,2}, {0LL,0LL,-1LL,-1LL}); + test.AddInput("updates", {2}, {0LL,3LL}); + test.AddOutput("output", {2,2}, {0LL,1LL,2LL,3LL}); + test.Run(); +} + TEST(ScatterNDOpTest, ScatterND_matrice_string_int64) { OpTester test1("ScatterND", 11); test1.AddInput("data", {2,2,2}, {"egg","dance","bob","air","smart","terry","laugh","kite"}); @@ -55,6 +64,22 @@ TEST(ScatterNDOpTest, ScatterND_matrice_string_int64) { test2.Run(); } +TEST(ScatterNDOpTest, ScatterND_matrice_string_int64_neg_indices) { + OpTester test1("ScatterND", 11); + test1.AddInput("data", {2,2,2}, {"egg","dance","bob","air","smart","terry","laugh","kite"}); + test1.AddInput("indices", {2,1,2}, {0,-1,-1,0}); + test1.AddInput("updates", {2,1,2}, {"air","bob","terry","smart"}); + test1.AddOutput("output", {2,2,2}, {"egg","dance","air","bob","terry","smart","laugh","kite"}); + test1.Run(); + + OpTester test2("ScatterND", 11); + test2.AddInput("data", {3,3}, {"egg","","air","","terry","smart","laugh","","hop"}); + test2.AddInput("indices", {3,2}, {-1,-2,1,0,0,-2}); + test2.AddInput("updates", {3}, {"kite","bob","dance"}); + test2.AddOutput("output", {3,3}, {"egg","dance","air","bob","terry","smart","laugh","kite","hop"}); + test2.Run(); +} + TEST(ScatterNDOpTest, ScatterND_slice_float_int64_t) { OpTester test("ScatterND", 11); test.AddInput("data", {2,2}, {0.0f,0.1f,0.1f,0.1f}); @@ -76,14 +101,14 @@ TEST(ScatterNDOpTest, ScatterND_slice_double_int64_t) { TEST(ScatterNDOpTest, ScatterND_3tensor_int64) { OpTester test1("ScatterND", 11); test1.AddInput("data", {2,2,2}, {0LL,1LL,1LL,1LL,1LL,1LL,6LL,7LL}); - test1.AddInput("indices", {2,2}, {0LL,1LL,1LL,0LL}); + test1.AddInput("indices", {2,2}, {0LL,1LL,-1LL,0LL}); test1.AddInput("updates", {2,2}, {2LL,3LL,4LL,5LL}); test1.AddOutput("output", {2,2,2}, {0LL,1LL,2LL,3LL,4LL,5LL,6LL,7LL}); test1.Run(); OpTester test2("ScatterND", 11); test2.AddInput("data", {2,2,2}, {0,0,2,3,4,0,6,7}); - test2.AddInput("indices", {2,3}, {0,0,1,1,0,1}); + test2.AddInput("indices", {2,3}, {0,0,1,-1,0,-1}); test2.AddInput("updates", {2}, {1,5}); test2.AddOutput("output", {2,2,2}, {0,1,2,3,4,5,6,7}); test2.Run(); @@ -142,7 +167,7 @@ TEST(ScatterNDOpTest, ScatterND_batched_3tensor_int64) { OpTester test2("ScatterND", 11); test2.AddInput("data", {2,2,2}, {0,0,2,0,4,0,0,7}); - test2.AddInput("indices", {2,2,3}, {0,0,1,1,0,1,0,1,1,1,1,0}); + test2.AddInput("indices", {2,2,3}, {0,0,-1,-1,0,-1,0,1,-1,1,-1,0}); test2.AddInput("updates", {2,2}, {1,5,3,6}); test2.AddOutput("output", {2,2,2}, {0,1,2,3,4,5,6,7}); test2.Run(); diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py index 121775d695..7bb0e675b6 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py @@ -658,6 +658,34 @@ def test_gradient_correctness(): _test_helpers.assert_values_are_close(ort_prediction, pt_prediction) _test_helpers.assert_gradients_match_and_reset_gradient(ort_model, pt_model) +@pytest.mark.parametrize("device", ['cpu', 'cuda']) +@pytest.mark.parametrize("indices", ([[ 2, 3, -1, -1],[0, 1, -1, -1]], + [[ 2, 3, 4, 4],[ 0, 1, 4, 4]])) +def test_scatternd_correctness(device, indices): + class NeuralNetScatterND(torch.nn.Module): + def __init__(self): + super(NeuralNetScatterND, self).__init__() + + def forward(self, rerouted_output, dispatch_mask, expert_output): + rerouted_output[dispatch_mask] = expert_output + return rerouted_output + + pt_model = NeuralNetScatterND().to(device) + ort_model = ORTModule(copy.deepcopy(pt_model)) + + def run_step(model, rerouted_output, dispatch_mask, expert_output): + prediction = model(rerouted_output, dispatch_mask, expert_output) + return prediction + + rerouted_output = torch.tensor([[0.],[0.],[0.],[0.],[0.]], device=device) + dispatch_mask = torch.tensor(indices, device=device) + expert_output = torch.tensor([[[0.3817],[0.9625],[0.9625],[0.9625]],[[0.3817],[0.9625],[0.9625],[0.9625]]], device=device) + + pt_prediction = run_step(pt_model, rerouted_output, dispatch_mask, expert_output) + ort_prediction = run_step(ort_model, rerouted_output, dispatch_mask, expert_output) + _test_helpers.assert_values_are_close(ort_prediction, pt_prediction, atol=1e-5) + + @pytest.mark.parametrize("use_fp16", [False, True]) @pytest.mark.parametrize("input_requires_grad", [False, True]) def test_gradient_correctness_conv1d(use_fp16, input_requires_grad): From 078782ea3c27b3d1e26bf621fe2ffb3ca012d439 Mon Sep 17 00:00:00 2001 From: RandySheriffH <48490400+RandySheriffH@users.noreply.github.com> Date: Wed, 1 Dec 2021 15:50:36 -0800 Subject: [PATCH 5/8] exclude test case from reduced-op build (#9895) Co-authored-by: Randy Shuai --- onnxruntime/test/shared_lib/test_inference.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc index 73e436263c..87dd31c74a 100644 --- a/onnxruntime/test/shared_lib/test_inference.cc +++ b/onnxruntime/test/shared_lib/test_inference.cc @@ -1887,6 +1887,7 @@ TEST(CApiTest, TestPerSessionCustomThreadPoolHooks) { // Preventing resize tranformer issue: // https://github.com/microsoft/onnxruntime/issues/9857 +#ifndef REDUCED_OPS_BUILD TEST(CApiTest, crop_and_resize) { std::vector input_value_0; input_value_0.resize(2 * 36 * 36 * 3); @@ -1921,6 +1922,7 @@ TEST(CApiTest, crop_and_resize) { ASSERT_EQ(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, output_type_shape.GetElementType()); ASSERT_EQ(output_shape, output_type_shape.GetShape()); } +#endif } // namespace TestPerSessionCustomThreadHooks #endif From 00c979db4db061ab7d9836d6a3d8f797087a3f15 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Thu, 2 Dec 2021 13:51:22 +1000 Subject: [PATCH 6/8] Update doc for operators/opsets supported by mobile package (#9899) --- docs/ORTMobilePackageOperatorTypeSupport.md | 189 ++++++++++---------- 1 file changed, 97 insertions(+), 92 deletions(-) diff --git a/docs/ORTMobilePackageOperatorTypeSupport.md b/docs/ORTMobilePackageOperatorTypeSupport.md index 7e08e06890..09de5d9d4c 100644 --- a/docs/ORTMobilePackageOperatorTypeSupport.md +++ b/docs/ORTMobilePackageOperatorTypeSupport.md @@ -17,104 +17,109 @@ NOTE: Operators used to manipulate dimensions and indices will support int32 and |Operator|Opsets| |--------|------| |**ai.onnx**|| -|ai.onnx:Abs|12, 13| -|ai.onnx:Add|12, 13| -|ai.onnx:And|12, 13| -|ai.onnx:ArgMax|12, 13| -|ai.onnx:ArgMin|12, 13| -|ai.onnx:AveragePool|12, 13| -|ai.onnx:Cast|12, 13| -|ai.onnx:Ceil|12, 13| -|ai.onnx:Clip|12, 13| -|ai.onnx:Concat|12, 13| -|ai.onnx:ConstantOfShape|12, 13| -|ai.onnx:Conv|12, 13| -|ai.onnx:ConvTranspose|12, 13| -|ai.onnx:Cos|12, 13| -|ai.onnx:CumSum|12, 13| -|ai.onnx:DepthToSpace|12, 13| -|ai.onnx:DequantizeLinear|12, 13| -|ai.onnx:Div|12, 13| -|ai.onnx:DynamicQuantizeLinear|12, 13| -|ai.onnx:Elu|12, 13| -|ai.onnx:Equal|12, 13| -|ai.onnx:Exp|12, 13| -|ai.onnx:Expand|12, 13| -|ai.onnx:Flatten|12, 13| -|ai.onnx:Floor|12, 13| -|ai.onnx:Gather|12, 13| -|ai.onnx:GatherND|12, 13| -|ai.onnx:Gemm|12, 13| -|ai.onnx:GlobalAveragePool|12, 13| -|ai.onnx:Greater|12, 13| -|ai.onnx:GreaterOrEqual|12, 13| -|ai.onnx:Identity|12, 13| -|ai.onnx:If|12, 13| -|ai.onnx:LRN|12, 13| -|ai.onnx:LeakyRelu|12, 13| -|ai.onnx:Less|12, 13| -|ai.onnx:LessOrEqual|12, 13| -|ai.onnx:Log|12, 13| -|ai.onnx:LogSoftmax|12, 13| -|ai.onnx:Loop|12, 13| -|ai.onnx:MatMul|12, 13| -|ai.onnx:MatMulInteger|12, 13| -|ai.onnx:Max|12, 13| -|ai.onnx:MaxPool|12, 13| -|ai.onnx:Mean|12, 13| -|ai.onnx:Min|12, 13| -|ai.onnx:Mul|12, 13| -|ai.onnx:Neg|12, 13| -|ai.onnx:NonMaxSuppression|12, 13| -|ai.onnx:NonZero|12, 13| -|ai.onnx:Not|12, 13| -|ai.onnx:Or|12, 13| -|ai.onnx:PRelu|12, 13| -|ai.onnx:Pad|12, 13| -|ai.onnx:Pow|12, 13| -|ai.onnx:QLinearConv|12, 13| -|ai.onnx:QLinearMatMul|12, 13| -|ai.onnx:QuantizeLinear|12, 13| -|ai.onnx:Range|12, 13| -|ai.onnx:Reciprocal|12, 13| -|ai.onnx:ReduceMax|12, 13| -|ai.onnx:ReduceMean|12, 13| -|ai.onnx:ReduceMin|12, 13| -|ai.onnx:ReduceProd|12, 13| -|ai.onnx:ReduceSum|12, 13| -|ai.onnx:Relu|12, 13| -|ai.onnx:Reshape|12, 13| -|ai.onnx:Resize|12, 13| -|ai.onnx:ReverseSequence|12, 13| -|ai.onnx:Round|12, 13| -|ai.onnx:Scan|12, 13| -|ai.onnx:ScatterND|12, 13| -|ai.onnx:Shape|12, 13| -|ai.onnx:Sigmoid|12, 13| -|ai.onnx:Sin|12, 13| -|ai.onnx:Size|12, 13| -|ai.onnx:Slice|12, 13| -|ai.onnx:Softmax|12, 13| -|ai.onnx:SpaceToDepth|12, 13| -|ai.onnx:Split|12, 13| -|ai.onnx:Sqrt|12, 13| -|ai.onnx:Squeeze|12, 13| -|ai.onnx:Sub|12, 13| -|ai.onnx:Sum|12, 13| -|ai.onnx:Tanh|12, 13| -|ai.onnx:ThresholdedRelu|12, 13| -|ai.onnx:Tile|12, 13| -|ai.onnx:TopK|12, 13| -|ai.onnx:Transpose|12, 13| -|ai.onnx:Unique|12, 13| -|ai.onnx:Unsqueeze|12, 13| -|ai.onnx:Where|12, 13| +|ai.onnx:Abs|12, 13, 14, 15| +|ai.onnx:Add|12, 13, 14, 15| +|ai.onnx:And|12, 13, 14, 15| +|ai.onnx:ArgMax|12, 13, 14, 15| +|ai.onnx:ArgMin|12, 13, 14, 15| +|ai.onnx:AveragePool|12, 13, 14, 15| +|ai.onnx:Cast|12, 13, 14, 15| +|ai.onnx:Ceil|12, 13, 14, 15| +|ai.onnx:Clip|12, 13, 14, 15| +|ai.onnx:Concat|12, 13, 14, 15| +|ai.onnx:ConstantOfShape|12, 13, 14, 15| +|ai.onnx:Conv|12, 13, 14, 15| +|ai.onnx:ConvTranspose|12, 13, 14, 15| +|ai.onnx:Cos|12, 13, 14, 15| +|ai.onnx:CumSum|12, 13, 14, 15| +|ai.onnx:DepthToSpace|12, 13, 14, 15| +|ai.onnx:DequantizeLinear|12, 13, 14, 15| +|ai.onnx:Div|12, 13, 14, 15| +|ai.onnx:DynamicQuantizeLinear|12, 13, 14, 15| +|ai.onnx:Elu|12, 13, 14, 15| +|ai.onnx:Equal|12, 13, 14, 15| +|ai.onnx:Erf|12, 13, 14, 15| +|ai.onnx:Exp|12, 13, 14, 15| +|ai.onnx:Expand|12, 13, 14, 15| +|ai.onnx:Flatten|12, 13, 14, 15| +|ai.onnx:Floor|12, 13, 14, 15| +|ai.onnx:Gather|12, 13, 14, 15| +|ai.onnx:GatherND|12, 13, 14, 15| +|ai.onnx:Gemm|12, 13, 14, 15| +|ai.onnx:GlobalAveragePool|12, 13, 14, 15| +|ai.onnx:Greater|12, 13, 14, 15| +|ai.onnx:GreaterOrEqual|12, 13, 14, 15| +|ai.onnx:HardSigmoid|12, 13, 14, 15| +|ai.onnx:Identity|12, 13, 14, 15| +|ai.onnx:If|12, 13, 14, 15| +|ai.onnx:InstanceNormalization|12, 13, 14, 15| +|ai.onnx:LRN|12, 13, 14, 15| +|ai.onnx:LayerNormalization|1| +|ai.onnx:LeakyRelu|12, 13, 14, 15| +|ai.onnx:Less|12, 13, 14, 15| +|ai.onnx:LessOrEqual|12, 13, 14, 15| +|ai.onnx:Log|12, 13, 14, 15| +|ai.onnx:LogSoftmax|12, 13, 14, 15| +|ai.onnx:Loop|12, 13, 14, 15| +|ai.onnx:MatMul|12, 13, 14, 15| +|ai.onnx:MatMulInteger|12, 13, 14, 15| +|ai.onnx:Max|12, 13, 14, 15| +|ai.onnx:MaxPool|12, 13, 14, 15| +|ai.onnx:Mean|12, 13, 14, 15| +|ai.onnx:Min|12, 13, 14, 15| +|ai.onnx:Mul|12, 13, 14, 15| +|ai.onnx:Neg|12, 13, 14, 15| +|ai.onnx:NonMaxSuppression|12, 13, 14, 15| +|ai.onnx:NonZero|12, 13, 14, 15| +|ai.onnx:Not|12, 13, 14, 15| +|ai.onnx:Or|12, 13, 14, 15| +|ai.onnx:PRelu|12, 13, 14, 15| +|ai.onnx:Pad|12, 13, 14, 15| +|ai.onnx:Pow|12, 13, 14, 15| +|ai.onnx:QLinearConv|12, 13, 14, 15| +|ai.onnx:QLinearMatMul|12, 13, 14, 15| +|ai.onnx:QuantizeLinear|12, 13, 14, 15| +|ai.onnx:Range|12, 13, 14, 15| +|ai.onnx:Reciprocal|12, 13, 14, 15| +|ai.onnx:ReduceMax|12, 13, 14, 15| +|ai.onnx:ReduceMean|12, 13, 14, 15| +|ai.onnx:ReduceMin|12, 13, 14, 15| +|ai.onnx:ReduceProd|12, 13, 14, 15| +|ai.onnx:ReduceSum|12, 13, 14, 15| +|ai.onnx:Relu|12, 13, 14, 15| +|ai.onnx:Reshape|12, 13, 14, 15| +|ai.onnx:Resize|12, 13, 14, 15| +|ai.onnx:ReverseSequence|12, 13, 14, 15| +|ai.onnx:Round|12, 13, 14, 15| +|ai.onnx:Scan|12, 13, 14, 15| +|ai.onnx:ScatterND|12, 13, 14, 15| +|ai.onnx:Shape|12, 13, 14, 15| +|ai.onnx:Sigmoid|12, 13, 14, 15| +|ai.onnx:Sin|12, 13, 14, 15| +|ai.onnx:Size|12, 13, 14, 15| +|ai.onnx:Slice|12, 13, 14, 15| +|ai.onnx:Softmax|12, 13, 14, 15| +|ai.onnx:SpaceToDepth|12, 13, 14, 15| +|ai.onnx:Split|12, 13, 14, 15| +|ai.onnx:Sqrt|12, 13, 14, 15| +|ai.onnx:Squeeze|12, 13, 14, 15| +|ai.onnx:Sub|12, 13, 14, 15| +|ai.onnx:Sum|12, 13, 14, 15| +|ai.onnx:Tanh|12, 13, 14, 15| +|ai.onnx:ThresholdedRelu|12, 13, 14, 15| +|ai.onnx:Tile|12, 13, 14, 15| +|ai.onnx:TopK|12, 13, 14, 15| +|ai.onnx:Transpose|12, 13, 14, 15| +|ai.onnx:Unique|12, 13, 14, 15| +|ai.onnx:Unsqueeze|12, 13, 14, 15| +|ai.onnx:Where|12, 13, 14, 15| ||| |**com.microsoft**|| |com.microsoft:DynamicQuantizeMatMul|1| |com.microsoft:FusedConv|1| |com.microsoft:FusedGemm|1| |com.microsoft:FusedMatMul|1| +|com.microsoft:Gelu|1| |com.microsoft:MatMulIntegerToFloat|1| |com.microsoft:NhwcMaxPool|1| |com.microsoft:QLinearAdd|1| From 8d88a6ac7f15cfb0f075fb420ec24680872cd60d Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Wed, 1 Dec 2021 22:28:52 -0800 Subject: [PATCH 7/8] add --amdgpu-target=gfx90a (#9820) --- cmake/onnxruntime_providers.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake index a82629bed4..e29dd8f075 100644 --- a/cmake/onnxruntime_providers.cmake +++ b/cmake/onnxruntime_providers.cmake @@ -1106,6 +1106,9 @@ if (onnxruntime_USE_ROCM) # Generate GPU code for GFX9 Generation list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx906 --amdgpu-target=gfx908) + if (ROCM_VERSION_DEV_INT GREATER_EQUAL 50000) + list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx90a) + endif() #onnxruntime_add_shared_library_module(onnxruntime_providers_rocm ${onnxruntime_providers_rocm_src}) hip_add_library(onnxruntime_providers_rocm MODULE ${onnxruntime_providers_rocm_src}) From 3f5c1e1c58939c0de30e232ea0ef0c30644166b0 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Thu, 2 Dec 2021 18:43:34 +1000 Subject: [PATCH 8/8] Update to include the Xamarin targets for internal ORT builds so the managed nuget package is consistent as both CPU and GPU builds produce a package called Microsoft.ML.OnnxRuntime.Managed. (#9906) --- .../Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj index 46b30483a0..9f2bdf4f25 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj +++ b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj @@ -4,10 +4,10 @@ Microsoft.ML.OnnxRuntime - - + netstandard1.1;netstandard2.0;xamarinios10;monoandroid11.0;net5.0;netcoreapp3.1