From ca8ff8c91c3ae6f7587df3cea3bd82b5578613a9 Mon Sep 17 00:00:00 2001 From: Zhang Lei Date: Tue, 26 Nov 2019 11:35:28 -0800 Subject: [PATCH 01/13] Enable conv/conv_transpose for opset 11 in cuda execution provider. (#2401) Enable conv/conv_transpose and existing pooling for opset 11 in cuda execution provider. They are of spec dilates/strides change related cuda pooling ops for op set 11. --- .../providers/cuda/cuda_execution_provider.cc | 72 ++++++++++++------- onnxruntime/core/providers/cuda/nn/conv.cc | 12 +++- .../core/providers/cuda/nn/conv_transpose.cc | 12 +++- onnxruntime/core/providers/cuda/nn/pool.cc | 20 ++++-- 4 files changed, 84 insertions(+), 32 deletions(-) diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index dce0c23ab8..3d55b58446 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -389,12 +389,12 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, float, LRN); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, double, LRN); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MLFloat16, LRN); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, float, Conv); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, double, Conv); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MLFloat16, Conv); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, float, ConvTranspose); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, double, ConvTranspose); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MLFloat16, ConvTranspose); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, float, Conv); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, double, Conv); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, MLFloat16, Conv); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, float, ConvTranspose); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, double, ConvTranspose); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, MLFloat16, ConvTranspose); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 9, float, AveragePool); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 9, double, AveragePool); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 9, MLFloat16, AveragePool); @@ -549,13 +549,13 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDom class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, 10, Scan); // opset 10 -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, float, AveragePool); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, double, AveragePool); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, MLFloat16, AveragePool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, float, AveragePool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, double, AveragePool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, MLFloat16, AveragePool); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, Dropout); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, float, MaxPool); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, double, MaxPool); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, MLFloat16, MaxPool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, float, MaxPool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, double, MaxPool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, MLFloat16, MaxPool); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, NonMaxSuppression); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, float, Resize); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, double, Resize); @@ -640,6 +640,18 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, S class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, Squeeze); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, TopK); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, Unsqueeze); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, Conv); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, Conv); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, Conv); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, ConvTranspose); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, ConvTranspose); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, ConvTranspose); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, AveragePool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, AveragePool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, AveragePool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, MaxPool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, MaxPool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, MaxPool); static void RegisterCudaKernels(KernelRegistry& kernel_registry) { static const BuildKernelCreateInfoFn function_table[] = { @@ -824,12 +836,12 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -983,13 +995,13 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, // opset 10 - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -1075,6 +1087,18 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, }; for (auto& function_table_entry : function_table) { diff --git a/onnxruntime/core/providers/cuda/nn/conv.cc b/onnxruntime/core/providers/cuda/nn/conv.cc index 1eb9643f30..01b3929d78 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.cc +++ b/onnxruntime/core/providers/cuda/nn/conv.cc @@ -9,11 +9,21 @@ namespace onnxruntime { namespace cuda { +// Op Set 11 for Conv only update document to clearify default dilations and strides value. +// which are already convered by op set 11 cpu versoin, so simply add declaration. #define REGISTER_KERNEL_TYPED(T) \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ + Conv, \ + kOnnxDomain, \ + 1, 10, \ + T, \ + kCudaExecutionProvider, \ + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + Conv); \ ONNX_OPERATOR_TYPED_KERNEL_EX( \ Conv, \ kOnnxDomain, \ - 1, \ + 11, \ T, \ kCudaExecutionProvider, \ KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), \ diff --git a/onnxruntime/core/providers/cuda/nn/conv_transpose.cc b/onnxruntime/core/providers/cuda/nn/conv_transpose.cc index 3e51d36223..92fcf8ac53 100644 --- a/onnxruntime/core/providers/cuda/nn/conv_transpose.cc +++ b/onnxruntime/core/providers/cuda/nn/conv_transpose.cc @@ -6,11 +6,21 @@ namespace onnxruntime { namespace cuda { +// Op Set 11 for ConvTranspose only update document to clearify default dilations and strides value. +// which are already covered by op set 11 cpu version, so simply add declaration. #define REGISTER_KERNEL_TYPED(T) \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ + ConvTranspose, \ + kOnnxDomain, \ + 1, 10, \ + T, \ + kCudaExecutionProvider, \ + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + ConvTranspose); \ ONNX_OPERATOR_TYPED_KERNEL_EX( \ ConvTranspose, \ kOnnxDomain, \ - 1, \ + 11, \ T, \ kCudaExecutionProvider, \ KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), \ diff --git a/onnxruntime/core/providers/cuda/nn/pool.cc b/onnxruntime/core/providers/cuda/nn/pool.cc index ae6508e581..8dbdeca70c 100644 --- a/onnxruntime/core/providers/cuda/nn/pool.cc +++ b/onnxruntime/core/providers/cuda/nn/pool.cc @@ -30,12 +30,17 @@ namespace cuda { KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()).TypeConstraint("I", DataTypeImpl::GetTensorType()), \ Pool); + POOLING_KERNEL_VERSIONED(AveragePool, float, AveragePool, 7, 9) POOLING_KERNEL_VERSIONED(AveragePool, double, AveragePool, 7, 9) POOLING_KERNEL_VERSIONED(AveragePool, MLFloat16, AveragePool, 7, 9) -POOLING_KERNEL(AveragePool, float, AveragePool, 10) -POOLING_KERNEL(AveragePool, double, AveragePool, 10) -POOLING_KERNEL(AveragePool, MLFloat16, AveragePool, 10) +POOLING_KERNEL_VERSIONED(AveragePool, float, AveragePool, 10, 10) +POOLING_KERNEL_VERSIONED(AveragePool, double, AveragePool, 10, 10) +POOLING_KERNEL_VERSIONED(AveragePool, MLFloat16, AveragePool, 10, 10) +//AveragePool and MaxPool op set 11 only update spec document on default value for dilations and strides. +POOLING_KERNEL(AveragePool, float, AveragePool, 11) +POOLING_KERNEL(AveragePool, double, AveragePool, 11) +POOLING_KERNEL(AveragePool, MLFloat16, AveragePool, 11) POOLING_KERNEL(GlobalAveragePool, float, AveragePool, 1) POOLING_KERNEL(GlobalAveragePool, double, AveragePool, 1) POOLING_KERNEL(GlobalAveragePool, MLFloat16, AveragePool, 1) @@ -45,9 +50,12 @@ POOLING_KERNEL_VERSIONED(MaxPool, MLFloat16, MaxPool<1>, 1, 7) POOLING_KERNEL_VERSIONED(MaxPool, float, MaxPool<8>, 8, 9) POOLING_KERNEL_VERSIONED(MaxPool, double, MaxPool<8>, 8, 9) POOLING_KERNEL_VERSIONED(MaxPool, MLFloat16, MaxPool<8>, 8, 9) -POOLING_KERNEL(MaxPool, float, MaxPool<8>, 10) -POOLING_KERNEL(MaxPool, double, MaxPool<8>, 10) -POOLING_KERNEL(MaxPool, MLFloat16, MaxPool<8>, 10) +POOLING_KERNEL_VERSIONED(MaxPool, float, MaxPool<8>, 10, 10) +POOLING_KERNEL_VERSIONED(MaxPool, double, MaxPool<8>, 10, 10) +POOLING_KERNEL_VERSIONED(MaxPool, MLFloat16, MaxPool<8>, 10, 10) +POOLING_KERNEL(MaxPool, float, MaxPool<8>, 11) +POOLING_KERNEL(MaxPool, double, MaxPool<8>, 11) +POOLING_KERNEL(MaxPool, MLFloat16, MaxPool<8>, 11) POOLING_KERNEL(GlobalMaxPool, float, MaxPool<1>, 1) POOLING_KERNEL(GlobalMaxPool, double, MaxPool<1>, 1) POOLING_KERNEL(GlobalMaxPool, MLFloat16, MaxPool<1>, 1) From d6c84925d514a94d75ade49fb71a5f6f77885b0a Mon Sep 17 00:00:00 2001 From: Xiang Zhang Date: Tue, 26 Nov 2019 19:09:22 -0800 Subject: [PATCH 02/13] move logSessionCreation after session is initialized (#2481) --- onnxruntime/core/session/inference_session.cc | 13 ++++++++----- onnxruntime/core/session/inference_session.h | 1 + 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index e18aa82de3..84126926ce 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -240,11 +240,7 @@ common::Status InferenceSession::Load(std::functionIrVersion(), model_->ProducerName(), model_->ProducerVersion(), - model_->Domain(), model_->MainGraph().DomainToVersionMap(), model_->MainGraph().Name(), - model_->MetaData(), event_name, execution_providers_.GetIds()); + event_name_ = event_name; } catch (const std::exception& ex) { status = Status(common::ONNXRUNTIME, common::FAIL, "Exception during loading: " + std::string(ex.what())); @@ -633,6 +629,13 @@ common::Status InferenceSession::Initialize() { // handle any subgraphs ORT_RETURN_IF_ERROR_SESSIONID_(InitializeSubgraphSessions(graph, session_state_)); is_inited_ = true; + + // and log telemetry + const Env& env = Env::Default(); + env.GetTelemetryProvider().LogSessionCreation(session_id_, model_->IrVersion(), model_->ProducerName(), model_->ProducerVersion(), + model_->Domain(), model_->MainGraph().DomainToVersionMap(), model_->MainGraph().Name(), + model_->MetaData(), event_name_, execution_providers_.GetIds()); + LOGS(*session_logger_, INFO) << "Session successfully initialized."; } catch (const NotImplementedException& ex) { status = ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "Exception during initialization: ", ex.what()); diff --git a/onnxruntime/core/session/inference_session.h b/onnxruntime/core/session/inference_session.h index 791d61e62e..a25cb5ebc2 100644 --- a/onnxruntime/core/session/inference_session.h +++ b/onnxruntime/core/session/inference_session.h @@ -445,6 +445,7 @@ class InferenceSession { long long total_run_duration_since_last_; // the total duration (us) of Run() calls since the last report TimePoint time_sent_last_; // the TimePoint of the last report const long long kDurationBetweenSending = 1000* 1000 * 60 * 10; // duration in (us). send a report every 10 mins + std::string event_name_; // where the model is loaded from: ["model_loading_uri", "model_loading_proto", "model_loading_istream"] #ifdef ONNXRUNTIME_ENABLE_INSTRUMENT bool session_activity_started_ = false; From e29fb5cef1d0e8ad27a4ede6f37606a368f3682d Mon Sep 17 00:00:00 2001 From: Yulong Wang Date: Wed, 27 Nov 2019 00:32:26 -0800 Subject: [PATCH 03/13] Add BatchParallelFor, TryParallelFor, TryBatchParallelFor into ThreadPool (#2476) --- .../onnxruntime/core/platform/threadpool.h | 42 ++++++++ .../contrib_ops/cpu/crop_and_resize.cc | 19 +--- onnxruntime/core/common/threadpool.cc | 30 ++++++ .../cpu/object_detection/roialign.cc | 19 +--- onnxruntime/test/platform/threadpool_test.cc | 101 ++++++++++++++++++ 5 files changed, 177 insertions(+), 34 deletions(-) create mode 100644 onnxruntime/test/platform/threadpool_test.cc diff --git a/include/onnxruntime/core/platform/threadpool.h b/include/onnxruntime/core/platform/threadpool.h index 3337583612..e6b442348a 100644 --- a/include/onnxruntime/core/platform/threadpool.h +++ b/include/onnxruntime/core/platform/threadpool.h @@ -46,6 +46,11 @@ class ThreadPool { */ void ParallelFor(int32_t total, std::function fn); + /* + Schedule work in the interval [0, total), with calls split into (num_batches) batches. + */ + void BatchParallelFor(int32_t total, std::function fn, int32_t num_batches = 0); + /* Schedule work in the interval [first, last]. */ @@ -54,6 +59,43 @@ class ThreadPool { // This is not supported until the latest Eigen // void SetStealPartitions(const std::vector>& partitions); + /** + Tries to call the given function in parallel, with calls split into (num_batches) batches. + **/ + template + inline static void TryBatchParallelFor(concurrency::ThreadPool* tp, int32_t total, F&& fn, int32_t num_batches = 0) { + if (tp != nullptr) { + if (num_batches <= 0) { + num_batches = tp->NumThreads() + 1; + } + tp->BatchParallelFor(total, std::forward(fn), num_batches); + } else { +#ifdef USE_OPENMP +#pragma omp parallel for +#endif + for (int32_t i = 0; i < total; ++i) { + fn(i); + } + } + } + + /** + Tries to call the given function in parallel. + **/ + template + inline static void TryParallelFor(concurrency::ThreadPool* tp, int32_t total, F&& fn) { + if (tp != nullptr) { + tp->ParallelFor(total, std::forward(fn)); + } else { +#ifdef USE_OPENMP +#pragma omp parallel for +#endif + for (int32_t i = 0; i < total; ++i) { + fn(i); + } + } + } + int NumThreads() const; int CurrentThreadId() const; diff --git a/onnxruntime/contrib_ops/cpu/crop_and_resize.cc b/onnxruntime/contrib_ops/cpu/crop_and_resize.cc index f966c9c821..f486d6a6a9 100644 --- a/onnxruntime/contrib_ops/cpu/crop_and_resize.cc +++ b/onnxruntime/contrib_ops/cpu/crop_and_resize.cc @@ -43,17 +43,6 @@ namespace contrib { ADD_TYPED_CROPANDRESIZE_OP(float); -template -static void TryParallelFor(concurrency::ThreadPool* tp, int32_t total, T&& fn) { - if (tp != nullptr) - tp->ParallelFor(total, fn); - else { - for (int32_t i = 0; i != total; ++i) { - fn(i); - } - } -} - template void CropAndResizeForward(const TensorShape& output_shape, const T* bottom_data, @@ -71,9 +60,7 @@ void CropAndResizeForward(const TensorShape& output_shape, int64_t pooled_height = output_shape[2]; int64_t pooled_width = output_shape[3]; - // TODO: This should do blocks of work based on the number of threads in the threadpool with each block - // being n_rois / num_threads - std::function work_object = [&](int32_t n) { + ThreadPool::TryBatchParallelFor(ttp, static_cast(n_rois), [&](int32_t n) { int64_t index_n = n * channels * pooled_width * pooled_height; const T* offset_bottom_rois = bottom_rois + n * num_roi_cols; @@ -182,9 +169,7 @@ void CropAndResizeForward(const TensorShape& output_shape, } } // for pw } // for ph - }; // for n - - TryParallelFor(ttp, static_cast(n_rois), work_object); + }); // for n } template diff --git a/onnxruntime/core/common/threadpool.cc b/onnxruntime/core/common/threadpool.cc index 0595f8c56e..9f08dc68bb 100644 --- a/onnxruntime/core/common/threadpool.cc +++ b/onnxruntime/core/common/threadpool.cc @@ -57,6 +57,36 @@ void ThreadPool::ParallelFor(int32_t total, std::function fn) { barrier.Wait(); } +void ThreadPool::BatchParallelFor(int32_t total, std::function fn, int32_t num_batches) { + if (total <= 0) + return; + + if (total == 1) { + fn(0); + return; + } + + if (num_batches <= 1) { + for (int i = 0; i < total; i++) { + fn(i); + } + return; + } + + if (num_batches >= total) { + ParallelFor(total, fn); + return; + } + + ParallelFor(num_batches, [&](int batch_index) { + int start = batch_index * total / num_batches; + int end = (batch_index + 1) * total / num_batches; + for (int i = start; i < end; i++) { + fn(i); + } + }); +} + void ThreadPool::ParallelForRange(int64_t first, int64_t last, std::function fn) { if (last <= first) return; if (last - first == 1) { diff --git a/onnxruntime/core/providers/cpu/object_detection/roialign.cc b/onnxruntime/core/providers/cpu/object_detection/roialign.cc index e68258cb3b..f3bab71ef5 100644 --- a/onnxruntime/core/providers/cpu/object_detection/roialign.cc +++ b/onnxruntime/core/providers/cpu/object_detection/roialign.cc @@ -42,17 +42,6 @@ ADD_TYPED_ROIALIGN_OP(float); ADD_TYPED_ROIALIGN_OP(double); namespace { -template -void TryParallelFor(concurrency::ThreadPool* tp, int32_t total, T&& fn) { - if (tp != nullptr) - tp->ParallelFor(total, fn); - else { - for (int32_t i = 0; i != total; ++i) { - fn(i); - } - } -} - template struct PreCalc { int64_t pos1; @@ -183,9 +172,7 @@ void RoiAlignForward(const TensorShape& output_shape, int64_t pooled_height = output_shape[2]; int64_t pooled_width = output_shape[3]; - // TODO: This should do blocks of work based on the number of threads in the threadpool with each block - // being n_rois / num_threads - std::function work_object = [&](int32_t n) { + ThreadPool::TryBatchParallelFor(ttp, static_cast(n_rois), [&](int32_t n) { int64_t index_n = n * channels * pooled_width * pooled_height; const T* offset_bottom_rois = bottom_rois + n * num_roi_cols; @@ -281,9 +268,7 @@ void RoiAlignForward(const TensorShape& output_shape, } // for pw } // for ph } // for c - }; // for n - - TryParallelFor(ttp, static_cast(n_rois), work_object); + }); // for n } } // namespace diff --git a/onnxruntime/test/platform/threadpool_test.cc b/onnxruntime/test/platform/threadpool_test.cc new file mode 100644 index 0000000000..ec628bc293 --- /dev/null +++ b/onnxruntime/test/platform/threadpool_test.cc @@ -0,0 +1,101 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/platform/threadpool.h" + +#include + +#include "gtest/gtest.h" +#include +#include +#include +#include + +using namespace onnxruntime::concurrency; + +namespace { + +struct TestData { + explicit TestData(int num) : data(num, 0) {} + std::vector data; + std::mutex mutex; +}; + +// This unittest tests ThreadPool function by counting the number of calls to function with each index. +// the function should be called exactly once for each element. + +std::unique_ptr CreateTestData(int num) { + return onnxruntime::make_unique(num); +} + +void IncrementElement(TestData& test_data, int i) { + std::lock_guard lock(test_data.mutex); + test_data.data[i]++; +} + +void ValidateTestData(TestData& test_data) { + ASSERT_TRUE(std::count_if(test_data.data.cbegin(), + test_data.data.cend(), + [](int i) { return i != 1; }) == 0); +} + +void CreateThreadPoolAndTest(const std::string& name, int num_threads, const std::function& test_body) { + auto tp = onnxruntime::make_unique(name, num_threads); + test_body(tp.get()); +} + +void TestParallelFor(const std::string& name, int num_threads, int num_tasks) { + auto test_data = CreateTestData(num_tasks); + CreateThreadPoolAndTest(name, num_threads, [&](ThreadPool* tp) { + tp->ParallelFor(num_tasks, [&](int i) { + IncrementElement(*test_data, i); + }); + }); + ValidateTestData(*test_data); +} + +void TestBatchParallelFor(const std::string& name, int num_threads, int num_tasks, int batch_size) { + auto test_data = CreateTestData(num_tasks); + CreateThreadPoolAndTest(name, num_threads, [&](ThreadPool* tp) { + tp->BatchParallelFor( + num_tasks, [&](int i) { + IncrementElement(*test_data, i); + }, + batch_size); + }); + ValidateTestData(*test_data); +} + +} // namespace + +TEST(ThreadPoolTest, TestParallelFor_2_Thread_NoTask) { + TestParallelFor("TestParallelFor_2_Thread_NoTask", 2, 0); +} + +TEST(ThreadPoolTest, TestParallelFor_2_Thread_50_Task) { + TestParallelFor("TestParallelFor_2_Thread_50_Task", 2, 50); +} + +TEST(ThreadPoolTest, TestParallelFor_1_Thread_50_Task) { + TestParallelFor("TestParallelFor_1_Thread_50_Task", 1, 50); +} + +TEST(ThreadPoolTest, TestBatchParallelFor_2_Thread_50_Task_10_Batch) { + TestBatchParallelFor("TestBatchParallelFor_2_Thread_50_Task_10_Batch", 2, 50, 10); +} + +TEST(ThreadPoolTest, TestBatchParallelFor_2_Thread_50_Task_0_Batch) { + TestBatchParallelFor("TestBatchParallelFor_2_Thread_50_Task_0_Batch", 2, 50, 0); +} + +TEST(ThreadPoolTest, TestBatchParallelFor_2_Thread_50_Task_1_Batch) { + TestBatchParallelFor("TestBatchParallelFor_2_Thread_50_Task_1_Batch", 2, 50, 1); +} + +TEST(ThreadPoolTest, TestBatchParallelFor_2_Thread_50_Task_100_Batch) { + TestBatchParallelFor("TestBatchParallelFor_2_Thread_50_Task_100_Batch", 2, 50, 100); +} + +TEST(ThreadPoolTest, TestBatchParallelFor_2_Thread_81_Task_20_Batch) { + TestBatchParallelFor("TestBatchParallelFor_2_Thread_81_Task_20_Batch", 2, 81, 20); +} From 7c7d5a149c9ed52eec67304bae5c4b132166a8a1 Mon Sep 17 00:00:00 2001 From: Yulong Wang Date: Wed, 27 Nov 2019 02:09:18 -0800 Subject: [PATCH 04/13] Disable thread pool creation when enabled OpenMP (#2485) --- onnxruntime/core/framework/session_options.h | 1 + onnxruntime/core/session/inference_session.cc | 4 ++++ onnxruntime/test/perftest/README.md | 2 +- onnxruntime/test/perftest/command_args_parser.cc | 7 ++++++- onnxruntime/test/perftest/ort_test_session.cc | 5 ++++- 5 files changed, 16 insertions(+), 3 deletions(-) diff --git a/onnxruntime/core/framework/session_options.h b/onnxruntime/core/framework/session_options.h index 3ab220b0b6..27ac93f403 100644 --- a/onnxruntime/core/framework/session_options.h +++ b/onnxruntime/core/framework/session_options.h @@ -55,6 +55,7 @@ struct SessionOptions { TransformerLevel graph_optimization_level = TransformerLevel::Level1; // controls the size of the thread pool used to parallelize the execution of tasks within individual nodes (ops) + // if OpenMP is enabled, this configuration will be ignored int intra_op_num_threads = 0; // controls the size of the thread pool used to parallelize the execution of nodes (ops) diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index 84126926ce..5b13df4387 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -102,8 +102,12 @@ InferenceSession::InferenceSession(const SessionOptions& session_options, : session_options_(session_options), graph_transformation_mgr_(session_options.max_num_graph_transformation_steps), logging_manager_(logging_manager), +#ifndef USE_OPENMP thread_pool_(concurrency::CreateThreadPool("intra_op_thread_pool", session_options.intra_op_num_threads)), +#else + thread_pool_(nullptr), +#endif inter_op_thread_pool_(session_options.execution_mode == ExecutionMode::ORT_PARALLEL ? concurrency::CreateThreadPool("inter_op_thread_pool", session_options.inter_op_num_threads) diff --git a/onnxruntime/test/perftest/README.md b/onnxruntime/test/perftest/README.md index ae8851d546..9f3cba19c0 100644 --- a/onnxruntime/test/perftest/README.md +++ b/onnxruntime/test/perftest/README.md @@ -32,7 +32,7 @@ Options: -v: Show verbose information. - -x: [intra_op_num_threads]: Sets the number of threads used to parallelize the execution within nodes. A value of 0 means the test will auto-select a default. Must >=0. + -x: [intra_op_num_threads]: Sets the number of threads used to parallelize the execution within nodes. A value of 0 means the test will auto-select a default. Must >=0. If OpenMP is enabled, this configuration will be ignored. -y: [inter_op_num_threads]: Sets the number of threads used to parallelize the execution of the graph (across nodes), A value of 0 means the test will auto-select a default. Must >=0. diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc index 4b03aa7a12..30c8937828 100644 --- a/onnxruntime/test/perftest/command_args_parser.cc +++ b/onnxruntime/test/perftest/command_args_parser.cc @@ -41,7 +41,7 @@ namespace perftest { "\t-p [profile_file]: Specifies the profile name to enable profiling and dump the profile data to the file.\n" "\t-s: Show statistics result, like P75, P90.\n" "\t-v: Show verbose information.\n" - "\t-x [intra_op_num_threads]: Sets the number of threads used to parallelize the execution within nodes, A value of 0 means ORT will pick a default. Must >=0.\n" + "\t-x [intra_op_num_threads]: Sets the number of threads used to parallelize the execution within nodes, A value of 0 means ORT will pick a default. Must >=0. If OpenMP is enabled, this configuration will be ignored.\n" "\t-y [inter_op_num_threads]: Sets the number of threads used to parallelize the execution of the graph (across nodes), A value of 0 means ORT will pick a default. Must >=0.\n" "\t-P: Use parallel executor instead of sequential executor.\n" "\t-o [optimization level]: Default is 1. Valid values are 0 (disable), 1 (basic), 2 (extended), 99 (all).\n" @@ -123,10 +123,15 @@ namespace perftest { test_config.run_config.f_verbose = true; break; case 'x': +#ifdef USE_OPENMP + fprintf(stderr, "cannot use argument '-x' when OpenMP is enabled.\n"); + return false; +#else test_config.run_config.intra_op_num_threads = static_cast(OrtStrtol(optarg, nullptr)); if (test_config.run_config.intra_op_num_threads < 0) { return false; } +#endif break; case 'y': test_config.run_config.inter_op_num_threads = static_cast(OrtStrtol(optarg, nullptr)); diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index 5c8c5a8a99..2e16451ae2 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -82,7 +82,7 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device } else if (provider_name == onnxruntime::kAclExecutionProvider) { #ifdef USE_ACL Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ACL(session_options, - performance_test_config.run_config.enable_cpu_mem_arena ? 1 : 0)); + performance_test_config.run_config.enable_cpu_mem_arena ? 1 : 0)); #else ORT_THROW("Acl is not supported in this build\n"); #endif @@ -100,8 +100,11 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device else session_options.DisableMemPattern(); session_options.SetExecutionMode(performance_test_config.run_config.execution_mode); + +#ifndef USE_OPENMP fprintf(stdout, "Setting intra_op_num_threads to %d\n", performance_test_config.run_config.intra_op_num_threads); session_options.SetIntraOpNumThreads(performance_test_config.run_config.intra_op_num_threads); +#endif if (performance_test_config.run_config.execution_mode == ExecutionMode::ORT_PARALLEL) { fprintf(stdout, "Setting inter_op_num_threads to %d\n", performance_test_config.run_config.inter_op_num_threads); From e57b735bb9a4912eb38f9de9d272d42c03779569 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Wed, 27 Nov 2019 10:15:50 -0800 Subject: [PATCH 05/13] Add a transformer to use Gelu approximation for cuda provider (#2480) * Add Gelu Approximation Transformer to convert Gelu or AddGeluFusion to FastGelu to get better inference performance. --- .../core/optimizer/graph_transformer_level.h | 5 +- .../core/optimizer/gelu_approximation.cc | 119 ++++++++++++++++++ .../core/optimizer/gelu_approximation.h | 24 ++++ .../core/optimizer/graph_transformer_utils.cc | 3 +- .../core/session/abi_session_options.cc | 2 +- onnxruntime/core/session/inference_session.cc | 20 ++- .../test/framework/inference_session_test.cc | 2 +- .../test/optimizer/graph_transform_test.cc | 70 +++++++++++ .../transform/approximation/gelu.onnx | 13 ++ .../approximation/gelu_add_bias.onnx | 17 +++ .../approximation/gelu_add_matmul.onnx | 25 ++++ .../gelu_add_shape_not_match.onnx | 19 +++ .../approximation/gelu_approximation_gen.py | 86 +++++++++++++ 13 files changed, 387 insertions(+), 18 deletions(-) create mode 100644 onnxruntime/core/optimizer/gelu_approximation.cc create mode 100644 onnxruntime/core/optimizer/gelu_approximation.h create mode 100644 onnxruntime/test/testdata/transform/approximation/gelu.onnx create mode 100644 onnxruntime/test/testdata/transform/approximation/gelu_add_bias.onnx create mode 100644 onnxruntime/test/testdata/transform/approximation/gelu_add_matmul.onnx create mode 100644 onnxruntime/test/testdata/transform/approximation/gelu_add_shape_not_match.onnx create mode 100644 onnxruntime/test/testdata/transform/approximation/gelu_approximation_gen.py diff --git a/include/onnxruntime/core/optimizer/graph_transformer_level.h b/include/onnxruntime/core/optimizer/graph_transformer_level.h index 4f2d5b305c..7aeb00ba66 100644 --- a/include/onnxruntime/core/optimizer/graph_transformer_level.h +++ b/include/onnxruntime/core/optimizer/graph_transformer_level.h @@ -12,9 +12,8 @@ enum class TransformerLevel : int { Level1, Level2, Level3, - // Convenience enum to always get the max available value. - // This way when we add more levels code which iterates over this enum does not need to change. - MaxTransformerLevel + // The max level should always be same as the last level. + MaxLevel = Level3 }; } // namespace onnxruntime diff --git a/onnxruntime/core/optimizer/gelu_approximation.cc b/onnxruntime/core/optimizer/gelu_approximation.cc new file mode 100644 index 0000000000..9982a3836d --- /dev/null +++ b/onnxruntime/core/optimizer/gelu_approximation.cc @@ -0,0 +1,119 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/optimizer/initializer.h" +#include "core/optimizer/gelu_approximation.h" +#include "core/framework/tensorprotoutils.h" +#include "core/optimizer/utils.h" +#include "core/graph/graph_utils.h" +#include "float.h" + +using namespace ONNX_NAMESPACE; +using namespace onnxruntime::common; +namespace onnxruntime { + +// FastGelu supports limited data types. +static std::vector supported_data_types{"tensor(float16)", "tensor(float)"}; + +static bool IsSupportedDataType(const Node& node) { + for (const auto& input_arg : node.InputDefs()) { + if (std::find(supported_data_types.begin(), supported_data_types.end(), + *(input_arg->Type())) == supported_data_types.end()) { + return false; + } + } + return true; +} + +static bool CheckInputShape(const Node& node, const NodeArg& input, const NodeArg& bias) { + const TensorShapeProto* bias_shape = bias.Shape(); + if (nullptr == bias_shape || + bias_shape->dim_size() != 1 || + !utils::HasDimValue(bias_shape->dim(0))) { + return false; + } + auto bias_length = bias_shape->dim(0).dim_value(); + + const TensorShapeProto* input_shape = input.Shape(); + if (nullptr != input_shape) { + if (input_shape->dim_size() >= 1) { + int last_dim = input_shape->dim_size() - 1; + if (utils::HasDimValue(input_shape->dim(last_dim)) && + input_shape->dim(last_dim).dim_value() == bias_length) { + return true; + } + } + return false; + } + + // Input does not have shape. We will check its parent node. + // When the parent is MatMul and its 2nd input has shape like {*, bias_length}, + // it means that the shape of MatMul output is good for FastGelu. + const Node* parent_node = graph_utils::GetInputNode(node, 0); + if (nullptr != parent_node && + graph_utils::IsSupportedOptypeVersionAndDomain(*parent_node, "MatMul", {1, 9}, kOnnxDomain)) { + const NodeArg& input_b = *(parent_node->InputDefs()[1]); + if (optimizer_utils::ValidateShape(input_b, {-1, bias_length})) { + return true; + } + } + + return false; +} + +static bool CheckGeluInputShape(const NodeArg& input) { + const TensorShapeProto* input_shape = input.Shape(); + return nullptr != input_shape && input_shape->dim_size() >= 1; +} + +static bool IsCandidateNode(Node& node, const std::unordered_set& compatible_providers) { + if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "AddGeluFusion", {1}, kMSDomain)) { + return graph_utils::IsSupportedProvider(node, compatible_providers) && + IsSupportedDataType(node) && + CheckInputShape(node, *(node.InputDefs()[0]), *(node.InputDefs()[1])); + } else if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "Gelu", {1}, kMSDomain)) { + return graph_utils::IsSupportedProvider(node, compatible_providers) && + IsSupportedDataType(node) && + CheckGeluInputShape(*(node.InputDefs()[0])); + } + return false; +} + +Status GeluApproximation::ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const { + GraphViewer graph_viewer(graph); + const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder(); + + int count = 0; + for (auto node_index : node_topology_list) { + auto* p_node = graph.GetNode(node_index); + if (p_node == nullptr) + continue; // we removed the node as part of an earlier fusion + + Node& node = *p_node; + ORT_RETURN_IF_ERROR(Recurse(node, modified, graph_level, logger)); + + if (IsCandidateNode(node, GetCompatibleExecutionProviders())) { + Node& fastgelu = graph.AddNode( + graph.GenerateNodeName("FastGelu"), + "FastGelu", + "Gelu approximation", + node.MutableInputDefs(), + node.MutableOutputDefs(), nullptr, kMSDomain); + + fastgelu.SetExecutionProviderType(node.GetExecutionProviderType()); + + graph_utils::RemoveNodeOutputEdges(graph, node); + graph.RemoveNode(node.Index()); + + count++; + } + } + + if (count > 0) { + modified = true; + LOGS(logger, INFO) << "Total Gelu Approximation (FastGelu) node count: " << count; + } + + return Status::OK(); +} +} // namespace onnxruntime diff --git a/onnxruntime/core/optimizer/gelu_approximation.h b/onnxruntime/core/optimizer/gelu_approximation.h new file mode 100644 index 0000000000..def16eec92 --- /dev/null +++ b/onnxruntime/core/optimizer/gelu_approximation.h @@ -0,0 +1,24 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/optimizer/graph_transformer.h" + +namespace onnxruntime { + +/** +@Class GeluApproximation + +Rewrite graph to replace Gelu or AddGeluFusion by FastGelu node. FastGelu uses approximation for Gelu, +and it is faster. +*/ +class GeluApproximation : public GraphTransformer { + public: + GeluApproximation(const std::unordered_set& compatible_execution_providers={}) noexcept + : GraphTransformer("GeluApproximation", compatible_execution_providers) {} + + Status ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const override; +}; + +} // namespace onnxruntime diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc index 8845752d2c..3f0b6c1b43 100644 --- a/onnxruntime/core/optimizer/graph_transformer_utils.cc +++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc @@ -20,6 +20,7 @@ #include "core/optimizer/free_dim_override_transformer.h" #include "core/optimizer/add_gelu_fusion.h" #include "core/optimizer/gelu_fusion.h" +#include "core/optimizer/gelu_approximation.h" #include "core/optimizer/layer_norm_fusion.h" #include "core/optimizer/skip_layer_norm_fusion.h" #include "core/optimizer/reshape_fusion.h" @@ -130,6 +131,7 @@ std::vector> GenerateTransformers(TransformerL std::unordered_set cuda_execution_providers = {onnxruntime::kCudaExecutionProvider}; transformers.emplace_back(onnxruntime::make_unique(cuda_execution_providers)); + transformers.emplace_back(onnxruntime::make_unique(cuda_execution_providers)); transformers.emplace_back(onnxruntime::make_unique(cuda_execution_providers)); #endif @@ -142,7 +144,6 @@ std::vector> GenerateTransformers(TransformerL transformers.emplace_back(onnxruntime::make_unique()); } #endif - } break; default: diff --git a/onnxruntime/core/session/abi_session_options.cc b/onnxruntime/core/session/abi_session_options.cc index 026e7425fc..d032e2dbd5 100644 --- a/onnxruntime/core/session/abi_session_options.cc +++ b/onnxruntime/core/session/abi_session_options.cc @@ -130,7 +130,7 @@ ORT_API_STATUS_IMPL(OrtApis::SetSessionGraphOptimizationLevel, _In_ OrtSessionOp options->value.graph_optimization_level = onnxruntime::TransformerLevel::Level2; break; case ORT_ENABLE_ALL: - options->value.graph_optimization_level = onnxruntime::TransformerLevel::Level3; + options->value.graph_optimization_level = onnxruntime::TransformerLevel::MaxLevel; break; default: return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "graph_optimization_level is not valid"); diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index 5b13df4387..c9aad03a7d 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -404,7 +404,7 @@ common::Status InferenceSession::TransformGraph(onnxruntime::Graph& graph, // apply transformers except default transformers // Default transformers are required for correctness and they are owned and run by inference session - for (int i = static_cast(TransformerLevel::Level1); i < static_cast(TransformerLevel::MaxTransformerLevel); i++) { + for (int i = static_cast(TransformerLevel::Level1); i <= static_cast(TransformerLevel::MaxLevel); i++) { ORT_RETURN_IF_ERROR_SESSIONID_(graph_transformer_mgr.ApplyTransformers(graph, static_cast(i), *session_logger_)); } @@ -1141,21 +1141,17 @@ void InferenceSession::AddPredefinedTransformers(GraphTransformerManager& transf } }; - ORT_ENFORCE(graph_optimization_level < TransformerLevel::MaxTransformerLevel, - "Allowed values are 1 and 2. Current level is set to " + + ORT_ENFORCE(graph_optimization_level <= TransformerLevel::MaxLevel, + "Exceeded max transformer level. Current level is set to " + std::to_string(static_cast(graph_optimization_level))); - if ((graph_optimization_level >= TransformerLevel::Level1) || !custom_list.empty()) { - add_transformers(TransformerLevel::Level1); + for (int i = static_cast(TransformerLevel::Level1); i <= static_cast(TransformerLevel::MaxLevel); i++) { + TransformerLevel level = static_cast(i); + if ((graph_optimization_level >= level) || !custom_list.empty()) { + add_transformers(level); + } } - if ((graph_optimization_level >= TransformerLevel::Level2) || !custom_list.empty()) { - add_transformers(TransformerLevel::Level2); - } - - if ((graph_optimization_level >= TransformerLevel::Level3) || !custom_list.empty()) { - add_transformers(TransformerLevel::Level3); - } } common::Status InferenceSession::WaitForNotification(Notification* p_executor_done, int64_t timeout_in_ms) { diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc index 44f221aa3e..e5abaa7176 100644 --- a/onnxruntime/test/framework/inference_session_test.cc +++ b/onnxruntime/test/framework/inference_session_test.cc @@ -1428,7 +1428,7 @@ TEST(InferenceSessionTests, TestCopyToFromDevices) { TEST(InferenceSessionTests, TestRegisterTransformers) { string model_uri = "testdata/transform/fusion/fuse-conv-bn-mul-add-unsqueeze.onnx"; - for (int i = static_cast(TransformerLevel::Default); i < static_cast(TransformerLevel::MaxTransformerLevel); i++) { + for (int i = static_cast(TransformerLevel::Default); i <= static_cast(TransformerLevel::MaxLevel); i++) { SessionOptions so; so.session_logid = "InferenceSessionTests.TestL1AndL2Transformers"; so.graph_optimization_level = static_cast(i); diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc index 9ef55cab71..9915505477 100644 --- a/onnxruntime/test/optimizer/graph_transform_test.cc +++ b/onnxruntime/test/optimizer/graph_transform_test.cc @@ -16,6 +16,7 @@ #include "core/optimizer/gemm_activation_fusion.h" #include "core/optimizer/add_gelu_fusion.h" #include "core/optimizer/gelu_fusion.h" +#include "core/optimizer/gelu_approximation.h" #include "core/optimizer/layer_norm_fusion.h" #include "core/optimizer/skip_layer_norm_fusion.h" #include "core/optimizer/graph_transformer.h" @@ -1101,6 +1102,75 @@ TEST(GraphTransformationTests, AddGeluFusionTest) { ASSERT_TRUE(op_to_count["GeluFusion"] == 0); } +// Test Gelu -> FastGelu +TEST(GraphTransformationTests, GeluApproximation_Gelu) { + auto model_uri = MODEL_FOLDER "approximation/gelu.onnx"; + std::shared_ptr p_model; + ASSERT_TRUE(Model::Load(model_uri, p_model, nullptr, DefaultLoggingManager().DefaultLogger()).IsOK()); + Graph& graph = p_model->MainGraph(); + + onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; + graph_transformation_mgr.Register(onnxruntime::make_unique(), TransformerLevel::Level2); + auto ret = graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, DefaultLoggingManager().DefaultLogger()); + ASSERT_TRUE(ret.IsOK()); + + std::map op_to_count = CountOpsInGraph(graph); + EXPECT_EQ(op_to_count["Gelu"], 0); + EXPECT_EQ(op_to_count["FastGelu"], 1); +} + +// Test AddGeluFusion -> FastGelu +TEST(GraphTransformationTests, GeluApproximation_Gelu_Add_Bias) { + auto model_uri = MODEL_FOLDER "approximation/gelu_add_bias.onnx"; + std::shared_ptr p_model; + ASSERT_TRUE(Model::Load(model_uri, p_model, nullptr, DefaultLoggingManager().DefaultLogger()).IsOK()); + Graph& graph = p_model->MainGraph(); + + onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; + graph_transformation_mgr.Register(onnxruntime::make_unique(), TransformerLevel::Level2); + auto ret = graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, DefaultLoggingManager().DefaultLogger()); + ASSERT_TRUE(ret.IsOK()); + + std::map op_to_count = CountOpsInGraph(graph); + EXPECT_EQ(op_to_count["AddGeluFusion"], 0); + EXPECT_EQ(op_to_count["FastGelu"], 1); +} + +// Test MatMul & AddGeluFusion -> MatMul & FastGelu +TEST(GraphTransformationTests, GeluApproximation_Gelu_Add_MatMul) { + auto model_uri = MODEL_FOLDER "approximation/gelu_add_matmul.onnx"; + std::shared_ptr p_model; + ASSERT_TRUE(Model::Load(model_uri, p_model, nullptr, DefaultLoggingManager().DefaultLogger()).IsOK()); + Graph& graph = p_model->MainGraph(); + + onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; + graph_transformation_mgr.Register(onnxruntime::make_unique(), TransformerLevel::Level2); + auto ret = graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, DefaultLoggingManager().DefaultLogger()); + ASSERT_TRUE(ret.IsOK()); + + std::map op_to_count = CountOpsInGraph(graph); + EXPECT_EQ(op_to_count["AddGeluFusion"], 0); + EXPECT_EQ(op_to_count["MatMul"], 1); + EXPECT_EQ(op_to_count["FastGelu"], 1); +} + +// Test AddGeluFusion with mis-match bias shape cannot convert to FastGelu. +TEST(GraphTransformationTests, GeluApproximation_Gelu_Add_Shape_Not_Match) { + auto model_uri = MODEL_FOLDER "approximation/gelu_add_shape_not_match.onnx"; + std::shared_ptr p_model; + ASSERT_TRUE(Model::Load(model_uri, p_model, nullptr, DefaultLoggingManager().DefaultLogger()).IsOK()); + Graph& graph = p_model->MainGraph(); + + onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; + graph_transformation_mgr.Register(onnxruntime::make_unique(), TransformerLevel::Level2); + auto ret = graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, DefaultLoggingManager().DefaultLogger()); + ASSERT_TRUE(ret.IsOK()); + + std::map op_to_count = CountOpsInGraph(graph); + EXPECT_EQ(op_to_count["AddGeluFusion"], 1); + EXPECT_EQ(op_to_count["FastGelu"], 0); +} + TEST(GraphTransformationTests, LayerNormFusionTest) { auto model_uri = MODEL_FOLDER "fusion/layer_norm.onnx"; std::shared_ptr p_model; diff --git a/onnxruntime/test/testdata/transform/approximation/gelu.onnx b/onnxruntime/test/testdata/transform/approximation/gelu.onnx new file mode 100644 index 0000000000..b183a1fc84 --- /dev/null +++ b/onnxruntime/test/testdata/transform/approximation/gelu.onnx @@ -0,0 +1,13 @@ +:~ +# +ACGelu_1"Gelu: com.microsoft Gelu_NoBiasZ$ +A + +batch + seq_len +€b$ +C + +batch + seq_len +€B \ No newline at end of file diff --git a/onnxruntime/test/testdata/transform/approximation/gelu_add_bias.onnx b/onnxruntime/test/testdata/transform/approximation/gelu_add_bias.onnx new file mode 100644 index 0000000000..b96cb4752a --- /dev/null +++ b/onnxruntime/test/testdata/transform/approximation/gelu_add_bias.onnx @@ -0,0 +1,17 @@ +:¦ +8 +A +BCAddGeluFusion_1" AddGeluFusion: com.microsoft Gelu_AddBiasZ$ +A + +batch + seq_len +€Z +B +  +€b$ +C + +batch + seq_len +€B \ No newline at end of file diff --git a/onnxruntime/test/testdata/transform/approximation/gelu_add_matmul.onnx b/onnxruntime/test/testdata/transform/approximation/gelu_add_matmul.onnx new file mode 100644 index 0000000000..688fa1cd68 --- /dev/null +++ b/onnxruntime/test/testdata/transform/approximation/gelu_add_matmul.onnx @@ -0,0 +1,25 @@ +:â + +A +BCMatMul_1"MatMul +8 +C +DEAddGeluFusion_1" AddGeluFusion: com.microsoftMatMul_AddGeluFusionZ$ +A + +batch + seq_len +xZ +B + + +€ +€Z +D +  +€b$ +E + +batch + seq_len +€B \ No newline at end of file diff --git a/onnxruntime/test/testdata/transform/approximation/gelu_add_shape_not_match.onnx b/onnxruntime/test/testdata/transform/approximation/gelu_add_shape_not_match.onnx new file mode 100644 index 0000000000..5061df5da8 --- /dev/null +++ b/onnxruntime/test/testdata/transform/approximation/gelu_add_shape_not_match.onnx @@ -0,0 +1,19 @@ +:Ä +8 +A +BCAddGeluFusion_1" AddGeluFusion: com.microsoftGelu_Add_ShapeNotMatchZ$ +A + +batch + seq_len +€Z$ +B + +batch + seq_len +€b$ +C + +batch + seq_len +€B \ No newline at end of file diff --git a/onnxruntime/test/testdata/transform/approximation/gelu_approximation_gen.py b/onnxruntime/test/testdata/transform/approximation/gelu_approximation_gen.py new file mode 100644 index 0000000000..65dae0cbe8 --- /dev/null +++ b/onnxruntime/test/testdata/transform/approximation/gelu_approximation_gen.py @@ -0,0 +1,86 @@ +import onnx +from onnx import helper +from onnx import TensorProto + +graph = helper.make_graph( + [ # nodes + # Add node before Gelu + helper.make_node("Gelu", ["A"], ["C"], "Gelu_1", domain="com.microsoft"), + ], + "Gelu_NoBias", #name + [ # inputs + helper.make_tensor_value_info('A', TensorProto.FLOAT, ['batch', 'seq_len', 3072]), + ], + [ # outputs + helper.make_tensor_value_info('C', TensorProto.FLOAT, ['batch', 'seq_len', 3072]), + ], + [ # initializers + ] +) + +model = helper.make_model(graph) +onnx.save(model, r'gelu.onnx') + +graph = helper.make_graph( + [ # nodes + # Add node before Gelu + helper.make_node("AddGeluFusion", ["A", "B"], ["C"], "AddGeluFusion_1", domain="com.microsoft"), + ], + "Gelu_AddBias", #name + [ # inputs + helper.make_tensor_value_info('A', TensorProto.FLOAT, ['batch', 'seq_len', 3072]), + helper.make_tensor_value_info('B', TensorProto.FLOAT, [3072]), + ], + [ # outputs + helper.make_tensor_value_info('C', TensorProto.FLOAT, ['batch', 'seq_len', 3072]), + ], + [ # initializers + ] +) + +model = helper.make_model(graph) +onnx.save(model, r'gelu_add_bias.onnx') + +graph = helper.make_graph( + [ # nodes + # Add node before Gelu + helper.make_node("AddGeluFusion", ["A", "B"], ["C"], "AddGeluFusion_1", domain="com.microsoft"), + ], + "Gelu_Add_ShapeNotMatch", #name + [ # inputs + helper.make_tensor_value_info('A', TensorProto.FLOAT, ['batch', 'seq_len', 3072]), + helper.make_tensor_value_info('B', TensorProto.FLOAT, ['batch', 'seq_len', 3072]), # Bias shape not matched for FastGelu + ], + [ # outputs + helper.make_tensor_value_info('C', TensorProto.FLOAT, ['batch', 'seq_len', 3072]), + ], + [ # initializers + ] +) + +model = helper.make_model(graph) +onnx.save(model, r'gelu_add_shape_not_match.onnx') + + +graph = helper.make_graph( + [ # nodes + # Add node before Gelu + helper.make_node("MatMul", ["A", "B"], ["C"], "MatMul_1"), + helper.make_node("AddGeluFusion", ["C", "D"], ["E"], "AddGeluFusion_1", domain="com.microsoft"), + ], + "MatMul_AddGeluFusion", #name + [ # inputs + helper.make_tensor_value_info('A', TensorProto.FLOAT, ['batch', 'seq_len', 'x']), + helper.make_tensor_value_info('B', TensorProto.FLOAT, [128, 3072]), + helper.make_tensor_value_info('D', TensorProto.FLOAT, [3072]), + ], + [ # outputs + helper.make_tensor_value_info('E', TensorProto.FLOAT, ['batch', 'seq_len', 3072]), + ], + [ # initializers + ] +) + +model = helper.make_model(graph) +onnx.save(model, r'gelu_add_matmul.onnx') + From ccbd778d0d15f2597f2cc5d644a58e99b44cca27 Mon Sep 17 00:00:00 2001 From: Yulong Wang Date: Wed, 27 Nov 2019 12:34:57 -0800 Subject: [PATCH 06/13] optimize CPU implementation of EmbedLayerNorm (#2491) * optimize CPU implementation of EmbedLayerNorm * use atomic in parallelization --- .../contrib_ops/cpu/bert/embed_layer_norm.cc | 84 ++++++++++++------- 1 file changed, 55 insertions(+), 29 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/bert/embed_layer_norm.cc b/onnxruntime/contrib_ops/cpu/bert/embed_layer_norm.cc index b3b77aa0e1..21a24e7e4c 100644 --- a/onnxruntime/contrib_ops/cpu/bert/embed_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/bert/embed_layer_norm.cc @@ -4,6 +4,9 @@ #include "embed_layer_norm.h" #include "embed_layer_norm_helper.h" #include "core/util/math_cpuonly.h" +#include "core/platform/threadpool.h" + +#include namespace onnxruntime { namespace contrib { @@ -60,40 +63,63 @@ Status EmbedLayerNorm::Compute(OpKernelContext* context) const { int position_embedding_length = static_cast(position_embedding->Shape()[0]); int segment_embedding_length = static_cast(segment_embedding->Shape()[0]); - ConstEigenArrayMap word_embedding_arr(word_embedding->template Data(), hidden_size, word_embedding_length); - ConstEigenArrayMap position_embedding_arr(position_embedding->template Data(), hidden_size, position_embedding_length); - ConstEigenArrayMap segment_embedding_arr(segment_embedding->template Data(), hidden_size, segment_embedding_length); - ConstEigenVectorMap gamma_vector(gamma->template Data(), hidden_size); - ConstEigenVectorMap beta_vector(beta->template Data(), hidden_size); - EigenArrayMap output_arr(output->template MutableData(), hidden_size, batch_size * sequence_length); + auto input_ids_data = input_ids->template Data(); + auto segment_ids_data = segment_ids->template Data(); + auto word_embedding_data = word_embedding->template Data(); + auto position_embedding_data = position_embedding->template Data(); + auto segment_embedding_data = segment_embedding->template Data(); + auto gamma_data = gamma->template Data(); + auto beta_data = beta->template Data(); + auto output_data = output->template MutableData(); // Calculate output { - size_t index = 0; - for (int b = 0; b < batch_size; b++) { - for (int s = 0; s < sequence_length; s++) { - int word_col_index = input_ids->template Data()[index]; - if (word_col_index < 0 || word_col_index >= word_embedding_length) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "word_col_index out of range"); - } - int position_col_index = s; - if (position_col_index >= position_embedding_length) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "position_col_index out of range"); - } - int segment_col_index = segment_ids->template Data()[index]; - if (segment_col_index < 0 || segment_col_index >= segment_embedding_length) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "segment_col_index out of range"); - } + std::atomic_bool failed{false}; - output_arr.col(index) = word_embedding_arr.col(word_col_index) + - position_embedding_arr.col(position_col_index) + - segment_embedding_arr.col(segment_col_index); - output_arr.col(index) -= output_arr.col(index).mean(); - output_arr.col(index) /= static_cast(sqrt(output_arr.col(index).pow(2).mean() + 1.0e-13)); - output_arr.col(index) *= gamma_vector.array(); - output_arr.col(index) += beta_vector.array(); - index++; + int n = batch_size * sequence_length; + concurrency::ThreadPool::TryBatchParallelFor(context->GetOperatorThreadPool(), n, [=, &failed](int index) { + int word_col_index = input_ids_data[index]; + if (word_col_index < 0 || word_col_index >= word_embedding_length) { + failed.store(true, std::memory_order_release); + return; } + int position_col_index = index % sequence_length; + if (position_col_index >= position_embedding_length) { + failed.store(true, std::memory_order_release); + return; + } + int segment_col_index = segment_ids_data[index]; + if (segment_col_index < 0 || segment_col_index >= segment_embedding_length) { + failed.store(true, std::memory_order_release); + return; + } + + T* y = output_data + index * hidden_size; + const T* input_word_embedding = word_embedding_data + word_col_index * hidden_size; + const T* input_position_embedding = position_embedding_data + position_col_index * hidden_size; + const T* input_segment_embedding = segment_embedding_data + segment_col_index * hidden_size; + + T sum = static_cast(0); + for (int i = 0; i < hidden_size; i++) { + T subtotal = input_word_embedding[i] + input_position_embedding[i] + input_segment_embedding[i]; + y[i] = subtotal; + sum += subtotal; + } + T mean = sum / hidden_size; + sum = 0; + for (int i = 0; i < hidden_size; i++) { + T a = y[i] - mean; + y[i] = a; + sum += a * a; + } + T e = sqrt(sum / hidden_size + static_cast(1.0e-13)); + for (int i = 0; i < hidden_size; i++) { + y[i] = y[i] / e * gamma_data[i] + beta_data[i]; + } + }); + + if (failed.load(std::memory_order_acquire)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "input index out of range"); } } From 04b6097db45610ddfa2c534ff0ea04589a1cc6ef Mon Sep 17 00:00:00 2001 From: Zhang Lei Date: Wed, 27 Nov 2019 12:42:45 -0800 Subject: [PATCH 07/13] Cuda Clip() for op set 11. (#2411) * Cuda Clip() for op set 11. * make min_val and max_value input CPU memory directly. * Remove original cu file useless "#pragma once" * merge duplicate logic into one class. --- .../providers/cuda/cuda_execution_provider.cc | 6 +- onnxruntime/core/providers/cuda/math/clip.cc | 57 +++++++++++++------ onnxruntime/core/providers/cuda/math/clip.h | 24 +++++--- .../core/providers/cuda/math/clip_impl.cu | 1 - 4 files changed, 60 insertions(+), 28 deletions(-) diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 3d55b58446..6b83987403 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -363,7 +363,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, float, Ceil); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, double, Ceil); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, MLFloat16, Ceil); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, float, Clip); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, 10, float, Clip); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, float, Reciprocal); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, double, Reciprocal); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, MLFloat16, Reciprocal); @@ -652,6 +652,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, MaxPool); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, MaxPool); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, MaxPool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, Clip); static void RegisterCudaKernels(KernelRegistry& kernel_registry) { static const BuildKernelCreateInfoFn function_table[] = { @@ -676,7 +677,7 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -1099,6 +1100,7 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, }; for (auto& function_table_entry : function_table) { diff --git a/onnxruntime/core/providers/cuda/math/clip.cc b/onnxruntime/core/providers/cuda/math/clip.cc index df3d4dd77e..7bfa0f1fe4 100644 --- a/onnxruntime/core/providers/cuda/math/clip.cc +++ b/onnxruntime/core/providers/cuda/math/clip.cc @@ -8,38 +8,61 @@ namespace onnxruntime { namespace cuda { -#define REGISTER_KERNEL_TYPED(T) \ - ONNX_OPERATOR_TYPED_KERNEL_EX( \ - Clip, \ - kOnnxDomain, \ - 6, \ - T, \ - kCudaExecutionProvider, \ - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), \ +#define REGISTER_KERNEL_TYPED(T) \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ + Clip, \ + kOnnxDomain, \ + 6, \ + 10, \ + T, \ + kCudaExecutionProvider, \ + KernelDefBuilder() \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + Clip); \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + Clip, \ + kOnnxDomain, \ + 11, \ + T, \ + kCudaExecutionProvider, \ + KernelDefBuilder() \ + .InputMemoryType(1) \ + .InputMemoryType(2) \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ Clip); template Status Clip::ComputeInternal(OpKernelContext* ctx) const { + T min_val = min_; + T max_val = max_; + if (is_min_max_input_) { + const auto* min_input = ctx->Input(1); + const auto* max_input = ctx->Input(2); + if (min_input) { + ORT_ENFORCE(min_input->Shape().NumDimensions() == 0, "min should be a scalar."); + min_val = *(min_input->template Data()); + } + if (max_input) { + ORT_ENFORCE(max_input->Shape().NumDimensions() == 0, "max should be a scalar."); + max_val = *(max_input->template Data()); + } + ORT_ENFORCE(min_val <= max_val); + } + const Tensor& X = *ctx->Input(0); - const TensorShape input_shape{X.Shape()}; + const TensorShape& input_shape{X.Shape()}; Tensor* Y = ctx->Output(0, input_shape); - size_t count = input_shape.Size(); - if (count > 0) { auto* y_data = Y->template MutableData(); const auto* x_data = X.template Data(); - ClipImpl(x_data, y_data, min_, max_, count); + ClipImpl(x_data, y_data, min_val, max_val, count); } return Status::OK(); } -#define SPECIALIZED_COMPUTE(T) \ - REGISTER_KERNEL_TYPED(T) \ - template Status Clip::ComputeInternal(OpKernelContext* ctx) const; - -SPECIALIZED_COMPUTE(float) +REGISTER_KERNEL_TYPED(float) } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/math/clip.h b/onnxruntime/core/providers/cuda/math/clip.h index 1f748990c1..d600dc1e9c 100644 --- a/onnxruntime/core/providers/cuda/math/clip.h +++ b/onnxruntime/core/providers/cuda/math/clip.h @@ -10,21 +10,29 @@ namespace cuda { template class Clip final : public CudaKernel { public: - Clip(const OpKernelInfo& info) : CudaKernel{info} { - auto min_val = -std::numeric_limits::infinity(); - auto max_val = std::numeric_limits::infinity(); + Clip(const OpKernelInfo& info) : CudaKernel{info}, is_min_max_input_(false) { + int start_version; + int end_version; + info.GetKernelDef().SinceVersion(&start_version, &end_version); - info.GetAttrOrDefault("min", &min_, min_val); - info.GetAttrOrDefault("max", &max_, max_val); - - // Make sure the range of interval is sensible - ORT_ENFORCE(min_val <= max_val); + if (start_version < 11) { + auto min_val = -std::numeric_limits::infinity(); + auto max_val = std::numeric_limits::infinity(); + info.GetAttrOrDefault("min", &min_, min_val); + info.GetAttrOrDefault("max", &max_, max_val); + ORT_ENFORCE(min_ <= max_); + } else { + min_ = -std::numeric_limits::infinity(); + max_ = std::numeric_limits::infinity(); + is_min_max_input_ = true; + } } Status ComputeInternal(OpKernelContext* context) const override; private: T min_, max_; + bool is_min_max_input_; }; } // namespace cuda diff --git a/onnxruntime/core/providers/cuda/math/clip_impl.cu b/onnxruntime/core/providers/cuda/math/clip_impl.cu index 42d415e946..a73c5ad812 100644 --- a/onnxruntime/core/providers/cuda/math/clip_impl.cu +++ b/onnxruntime/core/providers/cuda/math/clip_impl.cu @@ -1,7 +1,6 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#pragma once #include "core/providers/cuda/math/clip_impl.h" #include "core/providers/cuda/cu_inc/common.cuh" From 1fdf1006ac312295d164b2cbc0f88698122a7b8f Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Thu, 28 Nov 2019 07:03:23 +1000 Subject: [PATCH 08/13] Various fixes coming out of discussions in #2436 (#2497) - Add --skip_tests option to build.py based on github feedback - Add debug output at end of run_subprocess so it's clearer when the output is from a different process running - Add check for scipy as it's required by gen_test_models.py for the onnx tests - Use log.warning instead of warnings.warn for consistency. We use the logger almost everywhere and somewhat randomly used warnings.warn in two places. - Add check for 'wheel' dependency not being found in setup.py and handle more gracefully - Fix invalid input name in Keras tests --- .../python/onnxruntime_test_python_keras.py | 2 +- setup.py | 11 ++++++-- tools/ci_build/build.py | 26 ++++++++++++++----- 3 files changed, 29 insertions(+), 10 deletions(-) diff --git a/onnxruntime/test/python/onnxruntime_test_python_keras.py b/onnxruntime/test/python/onnxruntime_test_python_keras.py index 715a2a3490..77b788f234 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_keras.py +++ b/onnxruntime/test/python/onnxruntime_test_python_keras.py @@ -61,7 +61,7 @@ class TestInferenceSessionKeras(unittest.TestCase): # runtime content = converted_model.SerializeToString() rt = onnxrt.InferenceSession(content) - input = {'conv2d_1_input_0': x} + input = {rt.get_inputs()[0].name: x} actual_rt = rt.run(None, input) self.assertEqual(len(actual_rt), 1) np.testing.assert_allclose(actual, actual_rt[0], rtol=1e-05, atol=1e-08) diff --git a/setup.py b/setup.py index 0151c17457..a8b6282dac 100644 --- a/setup.py +++ b/setup.py @@ -110,7 +110,9 @@ try: logger.info('removing %s', file) remove(file) -except ImportError: +except ImportError as error: + print("Error importing dependencies:") + print(error) bdist_wheel = None # Additional binaries @@ -182,6 +184,11 @@ if nightly_build: date_suffix = str(datetime.datetime.now().date().strftime("%m%d")) version_number = version_number + ".dev" + date_suffix +cmd_classes = {} +if bdist_wheel is not None : + cmd_classes['bdist_wheel'] = bdist_wheel +cmd_classes['build_ext'] = build_ext + # Setup setup( name=package_name, @@ -190,7 +197,7 @@ setup( long_description=long_description, author='Microsoft Corporation', author_email='onnx@microsoft.com', - cmdclass={'bdist_wheel': bdist_wheel, 'build_ext': build_ext}, + cmdclass=cmd_classes, license="MIT License", packages=['onnxruntime', 'onnxruntime.backend', diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index be5fe0fc05..98d1d4a2d7 100755 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -14,7 +14,6 @@ import re import shutil import subprocess import sys -import warnings import hashlib from os.path import expanduser @@ -60,6 +59,7 @@ Use the individual flags to only run the specified stages. If you've done an update that fetched external dependencies you have to build without --parallel the first time. Once that's done, run with "--build --parallel --test" to just build in parallel and run tests.''') parser.add_argument("--test", action='store_true', help="Run unit tests.") + parser.add_argument("--skip_tests", action='store_true', help="Skip all tests.") # enable ONNX tests parser.add_argument("--enable_onnx_tests", action='store_true', @@ -150,7 +150,7 @@ Use the individual flags to only run the specified stages. parser.add_argument("--tensorrt_home", help="Path to TensorRT installation dir") parser.add_argument("--use_full_protobuf", action='store_true', help="Use the full protobuf library") parser.add_argument("--disable_contrib_ops", action='store_true', help="Disable contrib ops (reduces binary size)") - parser.add_argument("--skip_onnx_tests", action='store_true', help="Explicitly disable all onnx related tests") + parser.add_argument("--skip_onnx_tests", action='store_true', help="Explicitly disable all onnx related tests. Note: Use --skip_tests to skip all tests.") parser.add_argument("--enable_msvc_static_runtime", action='store_true', help="Enable static linking of MSVC runtimes.") parser.add_argument("--enable_language_interop_ops", action='store_true', help="Enable operator implemented in language other than cpp") parser.add_argument("--cmake_generator", choices=['Visual Studio 15 2017', 'Visual Studio 16 2019'], @@ -190,7 +190,9 @@ def run_subprocess(args, cwd=None, capture=False, dll_path=None, shell=False, en stdout, stderr = (subprocess.PIPE, subprocess.STDOUT) if capture else (None, None) my_env.update(env) - return subprocess.run(args, cwd=cwd, check=True, stdout=stdout, stderr=stderr, env=my_env, shell=shell) + completed_process = subprocess.run(args, cwd=cwd, check=True, stdout=stdout, stderr=stderr, env=my_env, shell=shell) + log.debug("Subprocess completed. Return code=" + str(completed_process.returncode)) + return completed_process def update_submodules(source_dir): run_subprocess(["git", "submodule", "sync", "--recursive"], cwd=source_dir) @@ -549,26 +551,33 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs, enab return if is_windows(): cwd = os.path.join(cwd, config) + run_subprocess([sys.executable, 'onnxruntime_test_python.py'], cwd=cwd, dll_path=dll_path) + try: import onnx + import scipy # gen_test_models.py used by onnx_test has a dependency on scipy onnx_test = True - except ImportError: - warnings.warn("onnx is not installed. Following test cannot be run.") + except ImportError as error: + log.exception(error) + log.warning("onnx or scipy is not installed. The ONNX tests will be skipped.") onnx_test = False + if onnx_test: run_subprocess([sys.executable, 'onnxruntime_test_python_backend.py'], cwd=cwd, dll_path=dll_path) - run_subprocess([sys.executable, os.path.join(source_dir,'onnxruntime','test','onnx','gen_test_models.py'),'--output_dir','test_models'], cwd=cwd) + run_subprocess([sys.executable, os.path.join(source_dir,'onnxruntime','test','onnx','gen_test_models.py'), + '--output_dir','test_models'], cwd=cwd) run_subprocess([os.path.join(cwd,'onnx_test_runner'), 'test_models'], cwd=cwd) if config != 'Debug': run_subprocess([sys.executable, 'onnx_backend_test_series.py'], cwd=cwd, dll_path=dll_path) + if not args.skip_keras_test: try: import onnxmltools import keras onnxml_test = True except ImportError: - warnings.warn("onnxmltools and keras are not installed. Following test cannot be run.") + log.warning("onnxmltools and keras are not installed. The keras tests will be skipped.") onnxml_test = False if onnxml_test: run_subprocess([sys.executable, 'onnxruntime_test_python_keras.py'], cwd=cwd, dll_path=dll_path) @@ -838,6 +847,9 @@ def main(): else: args.test = True + if args.skip_tests: + args.test = False + if args.use_tensorrt: args.use_cuda = True From 75b4747701b818a37af0dddca0d175f854e15964 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Wed, 27 Nov 2019 15:32:05 -0800 Subject: [PATCH 09/13] Fix a memleak in pybind. (#2503) --- onnxruntime/python/onnxruntime_pybind_mlvalue.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc index 491ca6e105..e2b68e121e 100644 --- a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc +++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc @@ -233,7 +233,7 @@ void CreateSequenceOfTensors(AllocatorPtr alloc, const std::string& name_input, throw std::runtime_error("CreateSequenceOfTensors: Input is not a tensor"); } auto p_tensor = CreateTensor(alloc, name_input, reinterpret_cast(py_obj)); - p_seq_tensors->tensors[i] = std::move(*(p_tensor.release())); + p_seq_tensors->tensors[i] = std::move(*p_tensor); } } From ee0bde6b69154a18e8719fa6bf4324b0f38f1f38 Mon Sep 17 00:00:00 2001 From: Zhang Lei Date: Thu, 28 Nov 2019 03:03:43 -0800 Subject: [PATCH 10/13] Enable three type of Equal() to version 11. (#2508) --- .../providers/cuda/cuda_execution_provider.cc | 18 ++++++++++++------ .../cuda/math/binary_elementwise_ops.cc | 8 +++++++- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 6b83987403..b33be3c7e0 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -300,9 +300,9 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 8, float, Greater); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 8, double, Greater); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 8, MLFloat16, Greater); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, bool, Equal); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, int32_t, Equal); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, int64_t, Equal); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 10, bool, Equal); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 10, int32_t, Equal); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 10, int64_t, Equal); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 8, Expand); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, int32_t, Greater); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, int64_t, Greater); @@ -653,6 +653,9 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, MaxPool); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, MaxPool); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, Clip); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, bool, Equal); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, int32_t, Equal); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, int64_t, Equal); static void RegisterCudaKernels(KernelRegistry& kernel_registry) { static const BuildKernelCreateInfoFn function_table[] = { @@ -749,9 +752,9 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -1101,6 +1104,9 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, }; for (auto& function_table_entry : function_table) { diff --git a/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.cc b/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.cc index f2941ab5e2..23865e28e5 100644 --- a/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.cc +++ b/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.cc @@ -176,6 +176,11 @@ Status BinaryElementwise::Prepare(OpKernelContext* context, Bin BINARY_ELEMENTWISE_REGISTER_KERNEL_TYPED(name, ver, int32_t) \ BINARY_ELEMENTWISE_REGISTER_KERNEL_TYPED(name, ver, int64_t) +#define BINARY_OP_REGISTER_VERSIONED_OIL(name, startver, endver) \ + BINARY_ELEMENTWISE_REGISTER_KERNEL_VERSIONED_TYPED(name, startver, endver, bool) \ + BINARY_ELEMENTWISE_REGISTER_KERNEL_VERSIONED_TYPED(name, startver, endver, int32_t) \ + BINARY_ELEMENTWISE_REGISTER_KERNEL_VERSIONED_TYPED(name, startver, endver, int64_t) + #define BINARY_LOGICALOP_REGISTER_OIL(name, ver) \ BINARY_ELEMENTWISE_LOGICALOP_REGISTER_KERNEL_TYPED(name, ver, bool) \ BINARY_ELEMENTWISE_LOGICALOP_REGISTER_KERNEL_TYPED(name, ver, int32_t) \ @@ -379,7 +384,8 @@ Status Less::ComputeInternal(OpKernelContext* context) const { BINARY_OP_REGISTER_UZILHFD(Sum, 8) BINARY_OP_REGISTER_VERSIONED_UZILHFD(Sum, 6, 7) BINARY_LOGICALOP_REGISTER_UZILHFD(Greater, 9) -BINARY_OP_REGISTER_OIL(Equal, 7) +BINARY_OP_REGISTER_VERSIONED_OIL(Equal, 7, 10) +BINARY_OP_REGISTER_OIL(Equal, 11) BINARY_OP_REGISTER_VERSIONED_HFD(Greater, 7, 8) BINARY_OP_REGISTER_HFD(Max, 8) BINARY_OP_REGISTER_VERSIONED_HFD(Max, 6, 7) From 005305be6e201c7ac0d8619f6813027165c521e5 Mon Sep 17 00:00:00 2001 From: Yufeng Li Date: Thu, 28 Nov 2019 08:29:59 -0800 Subject: [PATCH 11/13] Implement AddGelu and SkipLayerNorm (#2487) * Implement AddGelu and SkipLayerNorm --- .../contrib_ops/cpu/bert/bias_gelu_fusion.cc | 76 +++++++++++ .../contrib_ops/cpu/bert/bias_gelu_fusion.h | 20 +++ onnxruntime/contrib_ops/cpu/layer_norm.cc | 70 +++++----- .../contrib_ops/cpu/skip_layer_norm.cc | 129 ++++++++++++++++++ onnxruntime/contrib_ops/cpu/skip_layer_norm.h | 21 +++ .../contrib_ops/cpu_contrib_kernels.cc | 8 +- .../cuda/math/binary_elementwise_ops.cc | 2 +- .../cuda/math/binary_elementwise_ops.h | 4 +- .../cuda/math/binary_elementwise_ops_impl.cu | 2 +- .../cuda/math/binary_elementwise_ops_impl.h | 2 +- .../contrib_ops/cuda_contrib_kernels.cc | 12 +- .../core/graph/contrib_ops/contrib_defs.cc | 22 ++- ...add_gelu_fusion.cc => bias_gelu_fusion.cc} | 25 +++- .../{add_gelu_fusion.h => bias_gelu_fusion.h} | 6 +- .../core/optimizer/gelu_approximation.cc | 4 +- .../core/optimizer/graph_transformer_utils.cc | 7 +- .../test/contrib_ops/element_wise_ops_test.cc | 24 +--- .../test/contrib_ops/layer_norm_op_test.cc | 4 +- .../test/contrib_ops/skiplayernorm_op_test.cc | 70 +++++----- .../test/optimizer/graph_transform_test.cc | 31 +---- .../optimizer/graph_transform_utils_test.cc | 2 +- .../transform/approximation/gelu.onnx | 4 +- .../approximation/gelu_add_bias.onnx | 8 +- .../approximation/gelu_add_matmul.onnx | 8 +- .../gelu_add_shape_not_match.onnx | 19 --- .../approximation/gelu_approximation_gen.py | 25 +--- ...gelu_fusion.onnx => bias_gelu_fusion.onnx} | Bin 419 -> 401 bytes .../{gelu_add_gen.py => bias_gelu_gen.py} | 4 +- 28 files changed, 402 insertions(+), 207 deletions(-) create mode 100644 onnxruntime/contrib_ops/cpu/bert/bias_gelu_fusion.cc create mode 100644 onnxruntime/contrib_ops/cpu/bert/bias_gelu_fusion.h create mode 100644 onnxruntime/contrib_ops/cpu/skip_layer_norm.cc create mode 100644 onnxruntime/contrib_ops/cpu/skip_layer_norm.h rename onnxruntime/core/optimizer/{add_gelu_fusion.cc => bias_gelu_fusion.cc} (69%) rename onnxruntime/core/optimizer/{add_gelu_fusion.h => bias_gelu_fusion.h} (63%) delete mode 100644 onnxruntime/test/testdata/transform/approximation/gelu_add_shape_not_match.onnx rename onnxruntime/test/testdata/transform/fusion/{add_gelu_fusion.onnx => bias_gelu_fusion.onnx} (89%) rename onnxruntime/test/testdata/transform/fusion/{gelu_add_gen.py => bias_gelu_gen.py} (90%) diff --git a/onnxruntime/contrib_ops/cpu/bert/bias_gelu_fusion.cc b/onnxruntime/contrib_ops/cpu/bert/bias_gelu_fusion.cc new file mode 100644 index 0000000000..8d193eaca9 --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/bert/bias_gelu_fusion.cc @@ -0,0 +1,76 @@ +#include "bias_gelu_fusion.h" + +#include "core/util/math_cpuonly.h" +#include "core/platform/threadpool.h" +#include "core/mlas/inc/mlas.h" + +namespace onnxruntime { +namespace contrib { + +ONNX_OPERATOR_KERNEL_EX( + BiasGelu, + kMSDomain, + 1, + kCpuExecutionProvider, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + BiasGelu); + +template +Status BiasGelu::Compute(OpKernelContext* ctx) const { + const Tensor* X = ctx->Input(0); + const auto input_dims = X->Shape().GetDims(); + if (input_dims.size() < 1) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Shape of Input 0 is expected to have at least 1 dimension, got ", input_dims.size()); + } + + const Tensor* B = ctx->Input(1); + const auto bias_dims = B->Shape().GetDims(); + if (bias_dims.size() != 1) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input 1 is expected to have 1 dimension, got ", bias_dims.size()); + } + + int64_t bias_len = bias_dims[0]; + if (bias_len != input_dims[input_dims.size() - 1]) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "dimension 0 of Input 1 should have same length as the last dimension of input 0"); + } + + Tensor* Y = ctx->Output(0, X->Shape()); + + AllocatorPtr alloc; + ORT_RETURN_IF_ERROR(ctx->GetTempSpaceAllocator(&alloc)); + + BufferUniquePtr temp_data_buf_ptr = BufferUniquePtr(alloc->Alloc(sizeof(T) * X->Shape().Size()), BufferDeleter(alloc)); + T* tmp_data = static_cast(temp_data_buf_ptr.get()); + + const T* X_data = X->template Data(); + const T* B_data = B->template Data(); + T* Y_data = Y->template MutableData(); + int64_t task_count = X->Shape().Size() / bias_len; + + concurrency::ThreadPool::TryBatchParallelFor(ctx->GetOperatorThreadPool(), + static_cast(task_count), + [&](int32_t task_idx) { + const T* p_input = X_data + task_idx * bias_len; + T* p_output = Y_data + task_idx * bias_len; + T* p_output_tmp = tmp_data + task_idx * bias_len; + + for (int64_t h = 0; h < bias_len; h++) { + T value = p_input[h] + B_data[h]; + p_output[h] = value * static_cast(M_SQRT1_2); + p_output_tmp[h] = value * 0.5f; + } + + MlasComputeErf(p_output, p_output, bias_len); + + for (int64_t h = 0; h < bias_len; h++) { + p_output[h] = p_output_tmp[h] * (p_output[h] + 1.0f); + } + }); + + return Status::OK(); +} +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/bert/bias_gelu_fusion.h b/onnxruntime/contrib_ops/cpu/bert/bias_gelu_fusion.h new file mode 100644 index 0000000000..f8ccb6c0ef --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/bert/bias_gelu_fusion.h @@ -0,0 +1,20 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/common/common.h" +#include "core/framework/op_kernel.h" + +namespace onnxruntime { +namespace contrib { + +template +class BiasGelu : public OpKernel { + public: + explicit BiasGelu(const OpKernelInfo& info) : OpKernel(info) {} + Status Compute(OpKernelContext* context) const override; +}; + +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/layer_norm.cc b/onnxruntime/contrib_ops/cpu/layer_norm.cc index d6d4a16751..18f3664088 100644 --- a/onnxruntime/contrib_ops/cpu/layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/layer_norm.cc @@ -1,11 +1,13 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/framework/tensor.h" -#include "core/util/math_cpuonly.h" -#include "core/providers/common.h" #include "layer_norm.h" +#include "core/framework/tensor.h" +#include "core/platform/threadpool.h" +#include "core/providers/common.h" +#include "core/util/math_cpuonly.h" + namespace onnxruntime { namespace contrib { @@ -31,19 +33,22 @@ LayerNorm::LayerNorm(const OpKernelInfo& op_kernel_info) } template -Status LayerNorm::Compute(OpKernelContext* p_op_kernel_context) const { +Status LayerNorm::Compute(OpKernelContext* p_ctx) const { // Inputs - const Tensor* X = p_op_kernel_context->Input(0); - const Tensor* scale = p_op_kernel_context->Input(1); - const Tensor* bias = p_op_kernel_context->Input(2); + const Tensor* X = p_ctx->Input(0); + const Tensor* scale = p_ctx->Input(1); + const Tensor* bias = p_ctx->Input(2); auto X_data = X->template Data(); auto scale_data = scale->template Data(); auto bias_data = bias->template Data(); const TensorShape& x_shape = X->Shape(); const int64_t axis = HandleNegativeAxis(axis_, x_shape.NumDimensions()); - auto N = x_shape.SizeToDimension(axis); - auto M = x_shape.SizeFromDimension(axis); + auto norm_count = x_shape.SizeToDimension(axis); + auto norm_size = x_shape.SizeFromDimension(axis); + + Tensor* Y = p_ctx->Output(0, x_shape); + auto Y_data = Y->template MutableData(); std::vector mean_inv_std_var_dim; mean_inv_std_var_dim.reserve(x_shape.NumDimensions()); @@ -56,16 +61,16 @@ Status LayerNorm::Compute(OpKernelContext* p_op_kernel_context) const { } AllocatorPtr alloc; - ORT_RETURN_IF_ERROR(p_op_kernel_context->GetTempSpaceAllocator(&alloc)); + ORT_RETURN_IF_ERROR(p_ctx->GetTempSpaceAllocator(&alloc)); T* mean_data = nullptr; BufferUniquePtr mean_data_buf_ptr; - Tensor* mean = p_op_kernel_context->Output(1, TensorShape(mean_inv_std_var_dim)); + Tensor* mean = p_ctx->Output(1, TensorShape(mean_inv_std_var_dim)); if (mean != nullptr) { mean_data = mean->template MutableData(); } else { - auto mean_data_buf = alloc->Alloc(sizeof(T) * N); + auto mean_data_buf = alloc->Alloc(sizeof(T) * norm_count); mean_data_buf_ptr = BufferUniquePtr(mean_data_buf, BufferDeleter(alloc)); mean_data = static_cast(mean_data_buf_ptr.get()); } @@ -73,38 +78,39 @@ Status LayerNorm::Compute(OpKernelContext* p_op_kernel_context) const { T* inv_std_var_data = nullptr; BufferUniquePtr inv_std_var_data_buf_ptr; - Tensor* inv_std_var = p_op_kernel_context->Output(2, TensorShape(mean_inv_std_var_dim)); + Tensor* inv_std_var = p_ctx->Output(2, TensorShape(mean_inv_std_var_dim)); if (inv_std_var != nullptr) { inv_std_var_data = inv_std_var->template MutableData(); } else { - auto inv_std_var_data_buf = alloc->Alloc(sizeof(T) * N); + auto inv_std_var_data_buf = alloc->Alloc(sizeof(T) * norm_count); inv_std_var_data_buf_ptr = BufferUniquePtr(inv_std_var_data_buf, BufferDeleter(alloc)); inv_std_var_data = static_cast(inv_std_var_data_buf_ptr.get()); } - std::memset(mean_data, 0, sizeof(T) * N); - std::memset(inv_std_var_data, 0, sizeof(T) * N); + concurrency::ThreadPool::TryBatchParallelFor(p_ctx->GetOperatorThreadPool(), + static_cast(norm_count), + [&](int32_t task_idx) { + const T* p_input = X_data + task_idx * norm_size; + T* p_output = Y_data + task_idx * norm_size; - ConstEigenArrayMap X_arr(X_data, M, N); - for (int i = 0; i < N; ++i) { - mean_data[i] = X_arr.col(i).mean(); - inv_std_var_data[i] = X_arr.col(i).square().mean() - mean_data[i] * mean_data[i]; - } + T mean = 0; + T mean_square = 0; - // Compute Y = ((x - mean) * (inv_var) * scale + bias - Tensor* Y = p_op_kernel_context->Output(0, x_shape); - auto Y_data = Y->template MutableData(); - EigenArrayMap Y_arr(Y_data, M, N); + for (int64_t h = 0; h < norm_size; h++) { + mean += p_input[h]; + mean_square += p_input[h] * p_input[h]; + } - ConstEigenVectorArrayMap mean_arr(mean_data, N); - EigenVectorArrayMap inv_std_var_arr(inv_std_var_data, N); - inv_std_var_arr = (inv_std_var_arr + epsilon_).sqrt().inverse(); + mean = mean / norm_size; + mean_square = sqrt(mean_square / norm_size - mean * mean + epsilon_); - Y_arr = (X_arr.rowwise() - mean_arr.transpose()).rowwise() * inv_std_var_arr.transpose(); + for (int64_t h = 0; h < norm_size; h++) { + p_output[h] = (p_input[h] - mean) / mean_square * scale_data[h] + bias_data[h]; + } - ConstEigenVectorArrayMap scale_arr(scale_data, M); - ConstEigenVectorArrayMap bias_arr(bias_data, M); - Y_arr = (Y_arr.colwise() * scale_arr).colwise() + bias_arr; + mean_data[task_idx] = mean; + inv_std_var_data[task_idx] = mean_square; + }); return Status::OK(); } diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc new file mode 100644 index 0000000000..7dd0ff155f --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -0,0 +1,129 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/framework/tensor.h" +#include "core/util/math_cpuonly.h" +#include "core/providers/common.h" +#include "core/platform/threadpool.h" +#include "skip_layer_norm.h" + +namespace onnxruntime { +namespace contrib { + +#define REGISTER_KERNEL_TYPED(T) \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + SkipLayerNormalization, \ + kMSDomain, \ + 1, \ + T, \ + kCpuExecutionProvider, \ + KernelDefBuilder() \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + SkipLayerNorm); + +REGISTER_KERNEL_TYPED(float) +REGISTER_KERNEL_TYPED(double) + +template +SkipLayerNorm::SkipLayerNorm(const OpKernelInfo& op_kernel_info) + : OpKernel(op_kernel_info) { +} + +template +Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { + const Tensor* input = p_ctx->Input(0); + const Tensor* skip = p_ctx->Input(1); + const Tensor* gamma = p_ctx->Input(2); + const Tensor* beta = p_ctx->Input(3); + const Tensor* bias = p_ctx->Input(4); + Tensor* output = p_ctx->Output(0, input->Shape()); + + const auto input_dims = input->Shape().GetDims(); + if (input_dims.size() != 3) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "input is expected to have 3 dimensions, got ", input_dims.size()); + } + + if (input->Shape() != skip->Shape()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "skip is expected to have same shape as input"); + } + + const auto gamma_dims = gamma->Shape().GetDims(); + if (gamma_dims.size() != 1) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "gamma is expected to have 1 dimension, got ", gamma_dims.size()); + } + if (gamma_dims[0] != input_dims[2]) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Last dimension of gamma and input does not match"); + } + + const auto beta_dims = beta->Shape().GetDims(); + if (beta_dims.size() != 1) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "beta is expected to have 1 dimension, got ", beta_dims.size()); + } + if (beta_dims[0] != input_dims[2]) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Last dimension of beta and input does not match"); + } + + if (nullptr != bias) { + const auto bias_dims = bias->Shape().GetDims(); + if (bias_dims.size() != 1) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "bias is expected to have 1 dimension, got ", bias_dims.size()); + } + if (bias_dims[0] != input_dims[2]) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Last dimension of bias and input does not match"); + } + } + + int64_t batch_size = input_dims[0]; + int64_t sequence_length = input_dims[1]; + int64_t hidden_size = input_dims[2]; + int64_t task_count = batch_size * sequence_length; + + const T* input_data = input->Data(); + const T* skip_data = skip->Data(); + const T* gamma_data = gamma->Data(); + const T* beta_data = beta->Data(); + const T* bias_data = bias == nullptr ? nullptr : bias->Data(); + + T* output_data = output->MutableData(); + + concurrency::ThreadPool::TryBatchParallelFor(p_ctx->GetOperatorThreadPool(), + static_cast(task_count), + [&](int32_t task_idx) { + const T* p_input = input_data + task_idx * hidden_size; + const T* p_skip = skip_data + task_idx * hidden_size; + T* p_output = output_data + task_idx * hidden_size; + + T mean = 0; + T mean_square = 0; + + for (int64_t h = 0; h < hidden_size; h++) { + T value = p_input[h] + p_skip[h]; + if (nullptr != bias_data) { + value += bias_data[h]; + } + p_output[h] = value; + mean += value; + mean_square += value * value; + } + + mean = mean / hidden_size; + mean_square = sqrt(mean_square / hidden_size - mean * mean + float(1e-12)); + + for (int64_t h = 0; h < hidden_size; h++) { + p_output[h] = (p_output[h] - mean) / mean_square * gamma_data[h] + beta_data[h]; + } + }); + + return Status::OK(); +} // namespace contrib + +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.h b/onnxruntime/contrib_ops/cpu/skip_layer_norm.h new file mode 100644 index 0000000000..1ede7c8f23 --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.h @@ -0,0 +1,21 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/common/common.h" +#include "core/framework/op_kernel.h" +#include "core/framework/tensor.h" + +namespace onnxruntime { +namespace contrib { + +template +class SkipLayerNorm final : public OpKernel { + public: + SkipLayerNorm(const OpKernelInfo& op_kernel_info); + Status Compute(OpKernelContext* p_op_kernel_context) const override; +}; + +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu_contrib_kernels.cc b/onnxruntime/contrib_ops/cpu_contrib_kernels.cc index 5d7a9122c0..5e456aa653 100644 --- a/onnxruntime/contrib_ops/cpu_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/cpu_contrib_kernels.cc @@ -33,6 +33,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, CDist); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, double, CDist); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, Gelu); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, BiasGelu); // This section includes all op kernel declarations for former experimental ops which have now been removed from onnx. // To maintain backward compatibility these are added as contrib ops. @@ -68,6 +69,8 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSNchwcDomai class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSNchwcDomain, 1, float, GlobalAveragePool); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, float, LayerNormalization); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, double, LayerNormalization); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, SkipLayerNormalization); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, double, SkipLayerNormalization); Status RegisterNchwcKernels(KernelRegistry& kernel_registry) { static const BuildKernelCreateInfoFn function_table[] = { @@ -112,6 +115,7 @@ Status RegisterCpuContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, // These ops were experimental ops in onnx domain which have been removed now. We add them here as @@ -138,7 +142,9 @@ Status RegisterCpuContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo}; + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo}; for (auto& function_table_entry : function_table) { ORT_RETURN_IF_ERROR(kernel_registry.Register(function_table_entry())); diff --git a/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops.cc b/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops.cc index 5449bbb491..1d32d89810 100644 --- a/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops.cc +++ b/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops.cc @@ -63,7 +63,7 @@ namespace cuda { CONTRIB_BINARY_OP_TYPED(name, ver, float) \ CONTRIB_BINARY_OP_TYPED(name, ver, double) -CONTRIB_BINARY_OP_HFD(AddGeluFusion, 1) +CONTRIB_BINARY_OP_HFD(BiasGelu, 1) } // namespace cuda } // namespace contrib diff --git a/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops.h b/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops.h index 69dba848a5..9807321d43 100644 --- a/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops.h +++ b/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops.h @@ -16,9 +16,9 @@ namespace cuda { // AddGelu fuse Add + Gelu template -class AddGeluFusion final : public BinaryElementwise { +class BiasGelu final : public BinaryElementwise { public: - AddGeluFusion(const OpKernelInfo& info) : BinaryElementwise(info) { + BiasGelu(const OpKernelInfo& info) : BinaryElementwise(info) { } Status ComputeInternal(OpKernelContext* context) const override; diff --git a/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops_impl.cu b/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops_impl.cu index db4fcb18bb..4f8cc34b59 100644 --- a/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops_impl.cu +++ b/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops_impl.cu @@ -87,7 +87,7 @@ CONTRIB_BINARY_OPS() // D: double // O: bool -CONTRIB_SPECIALIZED_BINARY_ELEMENTWISE_IMPL_HFD(AddGeluFusion) +CONTRIB_SPECIALIZED_BINARY_ELEMENTWISE_IMPL_HFD(BiasGelu) } // namespace cuda } // namespace contrib diff --git a/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops_impl.h b/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops_impl.h index ee6cd680ae..22f1daa8e1 100644 --- a/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops_impl.h +++ b/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops_impl.h @@ -13,7 +13,7 @@ namespace cuda { // 4. Add op kernel class definition in binary_elementwise_ops.h // 5. Add op kernel registration and compute specialization in binary_elementwise_ops.cc #define CONTRIB_BINARY_OPS() \ - CONTRIB_BINARY_OP_NAME_EXPR(AddGeluFusion, _Gelu(a + b)) + CONTRIB_BINARY_OP_NAME_EXPR(BiasGelu, _Gelu(a + b)) // NOTE that cu files are compiled with nvcc and should not refer to any onnxruntime headers // so struct BinaryElementwisePreparation cannot be used here diff --git a/onnxruntime/contrib_ops/cuda_contrib_kernels.cc b/onnxruntime/contrib_ops/cuda_contrib_kernels.cc index 31790b899b..674546219a 100644 --- a/onnxruntime/contrib_ops/cuda_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/cuda_contrib_kernels.cc @@ -14,9 +14,9 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, Gelu); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, Gelu); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, Gelu); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, AddGeluFusion); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, AddGeluFusion); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, AddGeluFusion); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, BiasGelu); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, BiasGelu); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, BiasGelu); // These ops were experimental ops in onnx domain which have been removed now. We add them here as // contrib ops to maintain backward compatibility @@ -58,9 +58,9 @@ void RegisterCudaContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // These ops were experimental ops in onnx domain which have been removed now. We add them here as // contrib ops to maintain backward compatibility diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc index 0266703696..92937dad05 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc @@ -1945,28 +1945,22 @@ inputs by their magnitude, rather than gates inputs by their sign as in ReLUs.)D "Constrain input and output types to float tensors.") .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput); - ONNX_CONTRIB_OPERATOR_SCHEMA(AddGeluFusion) + static const char* BiasGelu_ver1_doc = + R"DOC(Bias Gelu. +It's an extension of Gelu. It takes the sum of input A and bias input B as the input of Gelu activation. )DOC"; + ONNX_CONTRIB_OPERATOR_SCHEMA(BiasGelu) .SetDomain(kMSDomain) .SinceVersion(1) .SetSupportLevel(OpSchema::SupportType::EXPERIMENTAL) - .SetDoc("AddGeluFusion fuses Add+Gelu. The fused Add op is the parent node of the fused Gelu.") - .Input(0, "A", "The input data as Tensor that is the first input of fused Add.", "T") - .Input(1, "B", "The input data as Tensor that is the second input of fused Add.", "T") + .SetDoc(BiasGelu_ver1_doc) + .Input(0, "A", "The normal input data.", "T") + .Input(1, "B", "The bias input data that is a 1D tensor.", "T") .Output(0, "C", "The output.", "T") .TypeConstraint( "T", {"tensor(float16)", "tensor(float)", "tensor(double)"}, "Constrain input and output types to float tensors.") - .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { - propagateElemTypeFromInputToOutput(ctx, 0, 0); - - if (hasNInputShapes(ctx, 2)) { - bidirectionalBroadcastShapeInference( - ctx.getInputType(0)->tensor_type().shape(), - ctx.getInputType(1)->tensor_type().shape(), - *ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape()); - } - }); + .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput); RegisterBertSchemas(); diff --git a/onnxruntime/core/optimizer/add_gelu_fusion.cc b/onnxruntime/core/optimizer/bias_gelu_fusion.cc similarity index 69% rename from onnxruntime/core/optimizer/add_gelu_fusion.cc rename to onnxruntime/core/optimizer/bias_gelu_fusion.cc index 2c26f3aded..94b8c601dc 100644 --- a/onnxruntime/core/optimizer/add_gelu_fusion.cc +++ b/onnxruntime/core/optimizer/bias_gelu_fusion.cc @@ -2,7 +2,7 @@ // Licensed under the MIT License. #include "core/optimizer/initializer.h" -#include "core/optimizer/add_gelu_fusion.h" +#include "core/optimizer/bias_gelu_fusion.h" #include "core/graph/graph_utils.h" #include @@ -10,7 +10,7 @@ using namespace ONNX_NAMESPACE; using namespace ::onnxruntime::common; namespace onnxruntime { -Status AddGeluFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const { +Status BiasGelu::ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const { GraphViewer graph_viewer(graph); const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder(); @@ -29,6 +29,21 @@ Status AddGeluFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, c continue; } + std::vector gelu_input; + const TensorShapeProto* add_input1_shape = node.MutableInputDefs()[0]->Shape(); + const TensorShapeProto* add_input2_shape = node.MutableInputDefs()[1]->Shape(); + if (add_input1_shape != nullptr && + add_input1_shape->dim_size() == 1) { + gelu_input.push_back(node.MutableInputDefs()[1]); + gelu_input.push_back(node.MutableInputDefs()[0]); + } else if (add_input2_shape != nullptr && + add_input2_shape->dim_size() == 1) { + gelu_input.push_back(node.MutableInputDefs()[0]); + gelu_input.push_back(node.MutableInputDefs()[1]); + } else { + continue; + } + auto next_node_itr = node.OutputNodesBegin(); if (next_node_itr == node.OutputNodesEnd()) { continue; @@ -47,10 +62,10 @@ Status AddGeluFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, c Node& add_node = node; Node& gelu_node = const_cast(next_node); - Node& gelu_add_fusion_node = graph.AddNode(graph.GenerateNodeName("AddGeluFusion"), - "AddGeluFusion", + Node& gelu_add_fusion_node = graph.AddNode(graph.GenerateNodeName("BiasGelu"), + "BiasGelu", "fused Add and Gelu", - {add_node.MutableInputDefs()}, + gelu_input, {}, {}, kMSDomain); diff --git a/onnxruntime/core/optimizer/add_gelu_fusion.h b/onnxruntime/core/optimizer/bias_gelu_fusion.h similarity index 63% rename from onnxruntime/core/optimizer/add_gelu_fusion.h rename to onnxruntime/core/optimizer/bias_gelu_fusion.h index 58caed2b87..9a2f8e5e91 100644 --- a/onnxruntime/core/optimizer/add_gelu_fusion.h +++ b/onnxruntime/core/optimizer/bias_gelu_fusion.h @@ -11,10 +11,10 @@ namespace onnxruntime { @Class GeluFusion Fuse Add + Gelu to GeluFusion */ -class AddGeluFusion : public GraphTransformer { +class BiasGelu : public GraphTransformer { public: - AddGeluFusion(const std::unordered_set& compatible_execution_providers = {}) noexcept - : GraphTransformer("AddGeluFusion", compatible_execution_providers) { + BiasGelu(const std::unordered_set& compatible_execution_providers = {}) noexcept + : GraphTransformer("BiasGelu", compatible_execution_providers) { } Status ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const override; diff --git a/onnxruntime/core/optimizer/gelu_approximation.cc b/onnxruntime/core/optimizer/gelu_approximation.cc index 9982a3836d..494666dd9e 100644 --- a/onnxruntime/core/optimizer/gelu_approximation.cc +++ b/onnxruntime/core/optimizer/gelu_approximation.cc @@ -66,8 +66,8 @@ static bool CheckGeluInputShape(const NodeArg& input) { return nullptr != input_shape && input_shape->dim_size() >= 1; } -static bool IsCandidateNode(Node& node, const std::unordered_set& compatible_providers) { - if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "AddGeluFusion", {1}, kMSDomain)) { +static bool IsCandidateNode(const Node& node, const std::unordered_set& compatible_providers) { + if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "BiasGelu", {1}, kMSDomain)) { return graph_utils::IsSupportedProvider(node, compatible_providers) && IsSupportedDataType(node) && CheckInputShape(node, *(node.InputDefs()[0]), *(node.InputDefs()[1])); diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc index 3f0b6c1b43..e2ae44fa18 100644 --- a/onnxruntime/core/optimizer/graph_transformer_utils.cc +++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc @@ -18,7 +18,7 @@ #include "core/optimizer/shape_to_initializer.h" #include "core/optimizer/nchwc_transformer.h" #include "core/optimizer/free_dim_override_transformer.h" -#include "core/optimizer/add_gelu_fusion.h" +#include "core/optimizer/bias_gelu_fusion.h" #include "core/optimizer/gelu_fusion.h" #include "core/optimizer/gelu_approximation.h" #include "core/optimizer/layer_norm_fusion.h" @@ -128,12 +128,11 @@ std::vector> GenerateTransformers(TransformerL transformers.emplace_back(onnxruntime::make_unique(cpu_cuda_execution_providers)); transformers.emplace_back(onnxruntime::make_unique(cpu_cuda_execution_providers)); transformers.emplace_back(onnxruntime::make_unique(cpu_cuda_execution_providers)); + transformers.emplace_back(onnxruntime::make_unique(cpu_cuda_execution_providers)); + transformers.emplace_back(onnxruntime::make_unique(cpu_cuda_execution_providers)); std::unordered_set cuda_execution_providers = {onnxruntime::kCudaExecutionProvider}; - transformers.emplace_back(onnxruntime::make_unique(cuda_execution_providers)); transformers.emplace_back(onnxruntime::make_unique(cuda_execution_providers)); - transformers.emplace_back(onnxruntime::make_unique(cuda_execution_providers)); - #endif } break; diff --git a/onnxruntime/test/contrib_ops/element_wise_ops_test.cc b/onnxruntime/test/contrib_ops/element_wise_ops_test.cc index 8cd60df3f9..10da620c53 100644 --- a/onnxruntime/test/contrib_ops/element_wise_ops_test.cc +++ b/onnxruntime/test/contrib_ops/element_wise_ops_test.cc @@ -84,7 +84,7 @@ const std::vector ComputeGeluWithErf(const std::vector& input_data return output; } -static void RunAddGeluFusionTest( +static void RunBiasGeluTest( const std::vector& input_a_data, const std::vector& input_b_data, const std::vector& input_a_dims, @@ -92,20 +92,18 @@ static void RunAddGeluFusionTest( if (HasCudaEnvironment(0)) { std::vector output_data = ComputeGeluWithErf(Add_Simple(input_a_data, input_b_data)); - OpTester tester("AddGeluFusion", 1, onnxruntime::kMSDomain); + OpTester tester("BiasGelu", 1, onnxruntime::kMSDomain); const std::vector& output_dims = input_a_dims.size() >= input_b_dims.size() ? input_a_dims : input_b_dims; tester.AddInput("A", input_a_dims, input_a_data); tester.AddInput("B", input_b_dims, input_b_data); tester.AddOutput("C", output_dims, output_data); - std::vector> execution_providers; - execution_providers.push_back(DefaultCudaExecutionProvider()); - tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + tester.Run(); } } -TEST(AddGeluFusionTest, Two_One_Dim) { +TEST(BiasGeluTest, Two_One_Dim) { std::vector input_a_data = { 0.8f, -0.5f, 0.0f, 1.f, 0.5f, 0.2f, 0.3f, -0.6f}; @@ -113,19 +111,7 @@ TEST(AddGeluFusionTest, Two_One_Dim) { std::vector input_b_data = { -0.5f, 0.6f, 1.2f, 2.1f}; - RunAddGeluFusionTest(input_a_data, input_b_data, {2, 4}, {4}); -} - -TEST(AddGeluFusionTest, Two_Two_Dim) { - std::vector input_a_data = { - 0.8f, -0.5f, 0.0f, 1.f, - 0.5f, 0.2f, 0.3f, -0.6f}; - - std::vector input_b_data = { - -0.5f, 0.6f, 1.2f, 2.1f, - 0.4f, 0.6f, 0.2f, -0.4f}; - - RunAddGeluFusionTest(input_a_data, input_b_data, {2, 4}, {2, 4}); + RunBiasGeluTest(input_a_data, input_b_data, {2, 4}, {4}); } } // namespace test diff --git a/onnxruntime/test/contrib_ops/layer_norm_op_test.cc b/onnxruntime/test/contrib_ops/layer_norm_op_test.cc index da0b447b48..4ae04d7b06 100644 --- a/onnxruntime/test/contrib_ops/layer_norm_op_test.cc +++ b/onnxruntime/test/contrib_ops/layer_norm_op_test.cc @@ -89,7 +89,7 @@ class LayerNormOpTester : public OpTester { } void Run() { #ifndef NDEBUG - run_called_ = true; + run_called_ = true; #endif std::vector cpu_fetches; std::vector cuda_fetches; @@ -110,7 +110,7 @@ class LayerNormOpTester : public OpTester { // Compare GPU with original subgraph if (DefaultCudaExecutionProvider()) { ASSERT_TRUE(cuda_fetches.size() == subgraph_fetches.size()); - for(size_t i = 0; i < cuda_fetches.size(); i++) { + for (size_t i = 0; i < cuda_fetches.size(); i++) { if (cuda_fetches[i].IsTensor() && subgraph_fetches[i].IsTensor()) { VLOGS_DEFAULT(1) << "Checking tensor " << i; CheckTensor(subgraph_fetches[i].Get(), cuda_fetches[i].Get(), 1e-3, 1e-3); diff --git a/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc b/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc index 2a557f800e..0d239ca4a1 100644 --- a/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc +++ b/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc @@ -20,45 +20,45 @@ static void RunTest( int sequence_length, int hidden_size, bool use_float16 = false) { - int min_cuda_architecture = use_float16 ? 530 : 0; - if (HasCudaEnvironment(min_cuda_architecture)) { + // Input and output shapes + // Input 0 - input: (batch_size, sequence_length, hidden_size) + // Input 1 - skip : (batch_size, sequence_length, hidden_size) + // Input 2 - gamma: (hidden_size) + // Input 3 - beta : (hidden_size) + // Output : (batch_size, sequence_length, hidden_size) + std::vector input_dims = {batch_size, sequence_length, hidden_size}; + std::vector skip_dims = input_dims; + std::vector gamma_dims = {hidden_size}; + std::vector beta_dims = gamma_dims; + std::vector bias_dims = gamma_dims; + std::vector output_dims = input_dims; + + if (!use_float16) { OpTester test("SkipLayerNormalization", 1, onnxruntime::kMSDomain); + test.AddInput("input", input_dims, input_data); + test.AddInput("skip", skip_dims, skip_data); + test.AddInput("gamma", gamma_dims, gamma_data); + test.AddInput("beta", beta_dims, beta_data); - // Input and output shapes - // Input 0 - input: (batch_size, sequence_length, hidden_size) - // Input 1 - skip : (batch_size, sequence_length, hidden_size) - // Input 2 - gamma: (hidden_size) - // Input 3 - beta : (hidden_size) - // Output : (batch_size, sequence_length, hidden_size) - std::vector input_dims = {batch_size, sequence_length, hidden_size}; - std::vector skip_dims = input_dims; - std::vector gamma_dims = {hidden_size}; - std::vector beta_dims = gamma_dims; - std::vector bias_dims = gamma_dims; - std::vector output_dims = input_dims; - - if (use_float16) { - test.AddInput("input", input_dims, ToFloat16(input_data)); - test.AddInput("skip", skip_dims, ToFloat16(skip_data)); - test.AddInput("gamma", gamma_dims, ToFloat16(gamma_data)); - test.AddInput("beta", beta_dims, ToFloat16(beta_data)); - if (!bias_data.empty()) { - test.AddInput("bias", bias_dims, ToFloat16(bias_data)); - } - - test.AddOutput("output", output_dims, ToFloat16(output_data)); - } else { - test.AddInput("input", input_dims, input_data); - test.AddInput("skip", skip_dims, skip_data); - test.AddInput("gamma", gamma_dims, gamma_data); - test.AddInput("beta", beta_dims, beta_data); - if (!bias_data.empty()) { - test.AddInput("bias", bias_dims, bias_data); - } - - test.AddOutput("output", output_dims, output_data); + if (!bias_data.empty()) { + test.AddInput("bias", bias_dims, bias_data); } + test.AddOutput("output", output_dims, output_data); + test.Run(); + } else if (HasCudaEnvironment(530 /*min_cuda_architecture*/)) { + OpTester test("SkipLayerNormalization", 1, onnxruntime::kMSDomain); + test.AddInput("input", input_dims, ToFloat16(input_data)); + test.AddInput("skip", skip_dims, ToFloat16(skip_data)); + test.AddInput("gamma", gamma_dims, ToFloat16(gamma_data)); + test.AddInput("beta", beta_dims, ToFloat16(beta_data)); + + if (!bias_data.empty()) { + test.AddInput("bias", bias_dims, ToFloat16(bias_data)); + } + + test.AddOutput("output", output_dims, ToFloat16(output_data)); + std::vector> execution_providers; execution_providers.push_back(DefaultCudaExecutionProvider()); test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc index 9915505477..854365515c 100644 --- a/onnxruntime/test/optimizer/graph_transform_test.cc +++ b/onnxruntime/test/optimizer/graph_transform_test.cc @@ -14,7 +14,7 @@ #include "core/optimizer/conv_activation_fusion.h" #include "core/optimizer/dropout_elimination.h" #include "core/optimizer/gemm_activation_fusion.h" -#include "core/optimizer/add_gelu_fusion.h" +#include "core/optimizer/bias_gelu_fusion.h" #include "core/optimizer/gelu_fusion.h" #include "core/optimizer/gelu_approximation.h" #include "core/optimizer/layer_norm_fusion.h" @@ -1082,15 +1082,15 @@ TEST(GraphTransformationTests, GeluFusionTest) { ASSERT_TRUE(op_to_count["Gelu"] == 1); } -TEST(GraphTransformationTests, AddGeluFusionTest) { - auto model_uri = MODEL_FOLDER "fusion/add_gelu_fusion.onnx"; +TEST(GraphTransformationTests, BiasGeluTest) { + auto model_uri = MODEL_FOLDER "fusion/bias_gelu_fusion.onnx"; std::shared_ptr p_model; ASSERT_TRUE(Model::Load(model_uri, p_model, nullptr, DefaultLoggingManager().DefaultLogger()).IsOK()); Graph& graph = p_model->MainGraph(); onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; graph_transformation_mgr.Register(onnxruntime::make_unique(), TransformerLevel::Level2); - graph_transformation_mgr.Register(onnxruntime::make_unique(), TransformerLevel::Level2); + graph_transformation_mgr.Register(onnxruntime::make_unique(), TransformerLevel::Level2); auto ret = graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, DefaultLoggingManager().DefaultLogger()); ASSERT_TRUE(ret.IsOK()); std::map op_to_count = CountOpsInGraph(graph); @@ -1099,7 +1099,7 @@ TEST(GraphTransformationTests, AddGeluFusionTest) { ASSERT_TRUE(op_to_count["Erf"] == 0); ASSERT_TRUE(op_to_count["Mul"] == 0); ASSERT_TRUE(op_to_count["Gelu"] == 0); - ASSERT_TRUE(op_to_count["GeluFusion"] == 0); + ASSERT_TRUE(op_to_count["BiasGelu"] == 1); } // Test Gelu -> FastGelu @@ -1132,7 +1132,7 @@ TEST(GraphTransformationTests, GeluApproximation_Gelu_Add_Bias) { ASSERT_TRUE(ret.IsOK()); std::map op_to_count = CountOpsInGraph(graph); - EXPECT_EQ(op_to_count["AddGeluFusion"], 0); + EXPECT_EQ(op_to_count["BiasGelu"], 0); EXPECT_EQ(op_to_count["FastGelu"], 1); } @@ -1149,28 +1149,11 @@ TEST(GraphTransformationTests, GeluApproximation_Gelu_Add_MatMul) { ASSERT_TRUE(ret.IsOK()); std::map op_to_count = CountOpsInGraph(graph); - EXPECT_EQ(op_to_count["AddGeluFusion"], 0); + EXPECT_EQ(op_to_count["BiasGelu"], 0); EXPECT_EQ(op_to_count["MatMul"], 1); EXPECT_EQ(op_to_count["FastGelu"], 1); } -// Test AddGeluFusion with mis-match bias shape cannot convert to FastGelu. -TEST(GraphTransformationTests, GeluApproximation_Gelu_Add_Shape_Not_Match) { - auto model_uri = MODEL_FOLDER "approximation/gelu_add_shape_not_match.onnx"; - std::shared_ptr p_model; - ASSERT_TRUE(Model::Load(model_uri, p_model, nullptr, DefaultLoggingManager().DefaultLogger()).IsOK()); - Graph& graph = p_model->MainGraph(); - - onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; - graph_transformation_mgr.Register(onnxruntime::make_unique(), TransformerLevel::Level2); - auto ret = graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, DefaultLoggingManager().DefaultLogger()); - ASSERT_TRUE(ret.IsOK()); - - std::map op_to_count = CountOpsInGraph(graph); - EXPECT_EQ(op_to_count["AddGeluFusion"], 1); - EXPECT_EQ(op_to_count["FastGelu"], 0); -} - TEST(GraphTransformationTests, LayerNormFusionTest) { auto model_uri = MODEL_FOLDER "fusion/layer_norm.onnx"; std::shared_ptr p_model; diff --git a/onnxruntime/test/optimizer/graph_transform_utils_test.cc b/onnxruntime/test/optimizer/graph_transform_utils_test.cc index 190a289807..4351fd873a 100644 --- a/onnxruntime/test/optimizer/graph_transform_utils_test.cc +++ b/onnxruntime/test/optimizer/graph_transform_utils_test.cc @@ -51,7 +51,7 @@ TEST(GraphTransformerUtilsTests, TestGenerateGraphTransformers) { } } ASSERT_TRUE(rule_transformer && rule_transformer->RulesCount() == 1); - + transformers = optimizer_utils::GenerateTransformers(TransformerLevel::Level2, {}, custom_list); #ifndef DISABLE_CONTRIB_OPS ASSERT_TRUE(transformers.size() == 1); diff --git a/onnxruntime/test/testdata/transform/approximation/gelu.onnx b/onnxruntime/test/testdata/transform/approximation/gelu.onnx index b183a1fc84..1f18ac6f51 100644 --- a/onnxruntime/test/testdata/transform/approximation/gelu.onnx +++ b/onnxruntime/test/testdata/transform/approximation/gelu.onnx @@ -1,4 +1,4 @@ -:~ +:~ # ACGelu_1"Gelu: com.microsoft Gelu_NoBiasZ$ A @@ -10,4 +10,4 @@  batch seq_len -€B \ No newline at end of file +€B diff --git a/onnxruntime/test/testdata/transform/approximation/gelu_add_bias.onnx b/onnxruntime/test/testdata/transform/approximation/gelu_add_bias.onnx index b96cb4752a..0c9ad44787 100644 --- a/onnxruntime/test/testdata/transform/approximation/gelu_add_bias.onnx +++ b/onnxruntime/test/testdata/transform/approximation/gelu_add_bias.onnx @@ -1,7 +1,7 @@ -:¦ -8 +:¡ +3 A -BCAddGeluFusion_1" AddGeluFusion: com.microsoft Gelu_AddBiasZ$ +BCAddGeluFusion_1"BiasGelu: com.microsoft Gelu_AddBiasZ$ A  batch @@ -14,4 +14,4 @@  batch seq_len -€B \ No newline at end of file +€B diff --git a/onnxruntime/test/testdata/transform/approximation/gelu_add_matmul.onnx b/onnxruntime/test/testdata/transform/approximation/gelu_add_matmul.onnx index 688fa1cd68..ff9f5a0ce5 100644 --- a/onnxruntime/test/testdata/transform/approximation/gelu_add_matmul.onnx +++ b/onnxruntime/test/testdata/transform/approximation/gelu_add_matmul.onnx @@ -1,10 +1,10 @@ -:â +:Ý  A BCMatMul_1"MatMul -8 +3 C -DEAddGeluFusion_1" AddGeluFusion: com.microsoftMatMul_AddGeluFusionZ$ +DEAddGeluFusion_1"BiasGelu: com.microsoftMatMul_AddGeluFusionZ$ A  batch @@ -22,4 +22,4 @@  batch seq_len -€B \ No newline at end of file +€B diff --git a/onnxruntime/test/testdata/transform/approximation/gelu_add_shape_not_match.onnx b/onnxruntime/test/testdata/transform/approximation/gelu_add_shape_not_match.onnx deleted file mode 100644 index 5061df5da8..0000000000 --- a/onnxruntime/test/testdata/transform/approximation/gelu_add_shape_not_match.onnx +++ /dev/null @@ -1,19 +0,0 @@ -:Ä -8 -A -BCAddGeluFusion_1" AddGeluFusion: com.microsoftGelu_Add_ShapeNotMatchZ$ -A - -batch - seq_len -€Z$ -B - -batch - seq_len -€b$ -C - -batch - seq_len -€B \ No newline at end of file diff --git a/onnxruntime/test/testdata/transform/approximation/gelu_approximation_gen.py b/onnxruntime/test/testdata/transform/approximation/gelu_approximation_gen.py index 65dae0cbe8..3f2d02af91 100644 --- a/onnxruntime/test/testdata/transform/approximation/gelu_approximation_gen.py +++ b/onnxruntime/test/testdata/transform/approximation/gelu_approximation_gen.py @@ -24,7 +24,7 @@ onnx.save(model, r'gelu.onnx') graph = helper.make_graph( [ # nodes # Add node before Gelu - helper.make_node("AddGeluFusion", ["A", "B"], ["C"], "AddGeluFusion_1", domain="com.microsoft"), + helper.make_node("BiasGelu", ["A", "B"], ["C"], "AddGeluFusion_1", domain="com.microsoft"), ], "Gelu_AddBias", #name [ # inputs @@ -41,32 +41,11 @@ graph = helper.make_graph( model = helper.make_model(graph) onnx.save(model, r'gelu_add_bias.onnx') -graph = helper.make_graph( - [ # nodes - # Add node before Gelu - helper.make_node("AddGeluFusion", ["A", "B"], ["C"], "AddGeluFusion_1", domain="com.microsoft"), - ], - "Gelu_Add_ShapeNotMatch", #name - [ # inputs - helper.make_tensor_value_info('A', TensorProto.FLOAT, ['batch', 'seq_len', 3072]), - helper.make_tensor_value_info('B', TensorProto.FLOAT, ['batch', 'seq_len', 3072]), # Bias shape not matched for FastGelu - ], - [ # outputs - helper.make_tensor_value_info('C', TensorProto.FLOAT, ['batch', 'seq_len', 3072]), - ], - [ # initializers - ] -) - -model = helper.make_model(graph) -onnx.save(model, r'gelu_add_shape_not_match.onnx') - - graph = helper.make_graph( [ # nodes # Add node before Gelu helper.make_node("MatMul", ["A", "B"], ["C"], "MatMul_1"), - helper.make_node("AddGeluFusion", ["C", "D"], ["E"], "AddGeluFusion_1", domain="com.microsoft"), + helper.make_node("BiasGelu", ["C", "D"], ["E"], "AddGeluFusion_1", domain="com.microsoft"), ], "MatMul_AddGeluFusion", #name [ # inputs diff --git a/onnxruntime/test/testdata/transform/fusion/add_gelu_fusion.onnx b/onnxruntime/test/testdata/transform/fusion/bias_gelu_fusion.onnx similarity index 89% rename from onnxruntime/test/testdata/transform/fusion/add_gelu_fusion.onnx rename to onnxruntime/test/testdata/transform/fusion/bias_gelu_fusion.onnx index d495584a739e7ff0a51925fffd9ec25f1e9e3633..5adf9e1de1a280b3d97fa8a6d9b66f2b8a2b9c9d 100644 GIT binary patch delta 30 lcmZ3?Jdv4&gVm~IBTEP)j{p~=lMpu-CkLYt>*O*w|+1qc8D delta 26 hcmbQpyqKAVgVk!*MwSpp79}o5r^yzK@{= Date: Thu, 28 Nov 2019 08:35:56 -0800 Subject: [PATCH 12/13] [NupharEP] Enable parallel schedule (#2505) * [NupharEP] Enable parallel schedule * Update TVM with the fix to TVM threadpool to use OpenMP if possible * Add parallel schedule when trying to vectorize With this change, BERT squad perf on a 4-core (8 HT) CPU goes from 187ms to 150ms * Address CR, docs and cmake update * Doc fix * Fix mkl * Fix TVM windows build when using mklml --- cmake/CMakeLists.txt | 22 ++++- cmake/external/tvm | 2 +- .../Nuphar-ExecutionProvider.md | 6 ++ .../passes/scheduler/schedule_utils.cc | 21 +++-- .../codegen/passes/scheduler/schedule_utils.h | 5 ++ .../nuphar/common/nuphar_settings.cc | 17 +++- .../providers/nuphar/common/nuphar_settings.h | 3 + .../providers/nuphar/compiler/nuphar_handle.h | 2 +- .../compiler/nuphar_schedule_builder.cc | 4 +- .../compiler/x86/scheduler/nuphar_scheduler.h | 8 ++ .../x86/scheduler/ort_type_schedule.cc | 82 ++++++++++++++++++- .../x86/scheduler/tvm_rule_schedule.cc | 12 ++- .../nuphar/mti_x86/math/reduce_ops.cc | 3 +- .../nuphar/nuphar_execution_provider.cc | 4 +- .../nuphar/nuphar_execution_provider.h | 9 +- .../nuphar/nuphar_provider_factory.cc | 2 +- 16 files changed, 170 insertions(+), 32 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 3e3583b445..eba8c0a0bd 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -343,6 +343,11 @@ if (onnxruntime_USE_ACL) list(APPEND onnxruntime_EXTERNAL_LIBRARIES arm_compute acl arm_compute_graph arm_compute_core) endif() +# MKLML +if (onnxruntime_USE_MKLDNN OR onnxruntime_USE_MKLML) + include(mkldnn) +endif() + # TVM if (onnxruntime_USE_TVM) if (onnxruntime_USE_CUDA) @@ -352,6 +357,19 @@ if (onnxruntime_USE_TVM) set(USE_LLVM ON) add_definitions(-DUSE_TVM_WITH_LLVM) endif() + if (onnxruntime_USE_OPENMP) + set(USE_OPENMP "gnu") + endif() + if (onnxruntime_USE_MKLML) + set(USE_OPENMP "intel") + # make sure MKLML in ORT is used by TVM + if (WIN32) + set(OMP_LIBRARY ${MKLML_LIB_DIR}/${IOMP5MD_IMPORT_LIB}) + else() + set(OMP_LIBRARY ${MKLML_LIB_DIR}/${IOMP5MD_SHARED_LIB}) + endif() + endif() + add_subdirectory(${PROJECT_SOURCE_DIR}/external/tvm EXCLUDE_FROM_ALL) set_target_properties(tvm PROPERTIES FOLDER "External/tvm") set_target_properties(tvm_topi PROPERTIES FOLDER "External/tvm") @@ -501,10 +519,6 @@ include_directories( ${REPO_ROOT}/include/onnxruntime/core/session ) -if (onnxruntime_USE_MKLDNN OR onnxruntime_USE_MKLML) - include(mkldnn) -endif() - if(onnxruntime_USE_GEMMLOWP) add_definitions(-DUSE_GEMMLOWP=1) endif() diff --git a/cmake/external/tvm b/cmake/external/tvm index 9b3a424a91..c6e3efcdb0 160000 --- a/cmake/external/tvm +++ b/cmake/external/tvm @@ -1 +1 @@ -Subproject commit 9b3a424a91d6003db1993cdd7121e46696f220e8 +Subproject commit c6e3efcdb09aeda961a6badf76093ceac69db64d diff --git a/docs/execution_providers/Nuphar-ExecutionProvider.md b/docs/execution_providers/Nuphar-ExecutionProvider.md index 05c6d57bfc..17244fcb8d 100644 --- a/docs/execution_providers/Nuphar-ExecutionProvider.md +++ b/docs/execution_providers/Nuphar-ExecutionProvider.md @@ -17,6 +17,12 @@ You can use the Nuphar execution provider via the python wheel from the ONNX Run ## Performance and Accuracy Testing You can test your ONNX model's performance with [onnxruntime_perf_test](../../onnxruntime/test/perftest/README.md), or test accuracy with [onnx_test_runner](../../onnxruntime/test/onnx/README.txt). To run these tools with the Nuphar execution provider, please pass `-e nuphar` in command line options. +Please note that Nuphar uses TVM thread pool and parallel schedule for multi-thread inference performance. When building with OpenMP or MKLML, TVM thread pool would use gomp or iomp as its implementation; otherwise, TVM creates its own thread pool. Because of this, the current default parallel schedule policy is: +- Default to on for USE_OPENMP or USE_MKLML. User can use OMP_NUM_THREADS/MKL_NUM_THREADS to control TVM thread pool, as well as TVM_NUM_THREADS +- Default to off for none of above. User can use TVM_NUM_THREADS to control TVM thread pool. + +This choice is to ensure to get ideal performance with the different build options. When build with USE_OPENMP or USE_MKLML, users would have to avoid thread confliction from OpenMP or MKL with their inference invocations anyway, so parallel schedule is enable to leverage existing thread pool. When not building with gomp or iomp, TVM thread pool is turned off to avoid confliction with user threads. If needed, user can set env or settings with [NUPHAR_PARALLEL_MIN_WORKLOADS](../../onnxruntime/core/providers/nuphar/common/nuphar_settings.cc#L61) to 0 to disable parallel schedule, or to some non-zero value to enable parallel schedule. The non-zero value indicates the minimal number of elements being computed per thread when parallel schedule would be turned on. + ## Model Conversion and Quantization You may use Python script [model_editor.py](../../onnxruntime/core/providers/nuphar/scripts/model_editor.py) to turn LSTM/GRU/RNN ops to Scan ops for a given model, and then use [model_quantizer.py](../../onnxruntime/core/providers/nuphar/scripts/model_quantizer.py) to quantize MatMul ops into MatMulInteger ops. diff --git a/onnxruntime/core/codegen/passes/scheduler/schedule_utils.cc b/onnxruntime/core/codegen/passes/scheduler/schedule_utils.cc index 8e86f13eaa..3595229bbe 100644 --- a/onnxruntime/core/codegen/passes/scheduler/schedule_utils.cc +++ b/onnxruntime/core/codegen/passes/scheduler/schedule_utils.cc @@ -58,6 +58,19 @@ bool InsertRootScheduleAndClosure( return true; } +// Check precondition for vectorize schedule +bool ShouldTryVectorization( + const tvm::Tensor& tensor, + ScheduleContext& ctx) { + auto it = ctx.scheduled_tensors.find(tensor->op.get()); + if (it != ctx.scheduled_tensors.end()) { + if (it->second > ScheduleType::ScheduleInline) { + return false; + } + } + return true; +} + // Check the schedule of tensor // If it is not scheduled, try to vectorize it. // Note TryVectorization has to use with compute_root. @@ -66,12 +79,8 @@ bool TryVectorization( const tvm::Tensor& tensor, int64_t natural_vector_size, ScheduleContext& ctx) { - auto it = ctx.scheduled_tensors.find(tensor->op.get()); - if (it != ctx.scheduled_tensors.end()) { - if (it->second > ScheduleType::ScheduleInline) { - return false; - } - } + if (!ShouldTryVectorization(tensor, ctx)) + return false; auto shape = tensor->shape; auto rank = shape.size(); diff --git a/onnxruntime/core/codegen/passes/scheduler/schedule_utils.h b/onnxruntime/core/codegen/passes/scheduler/schedule_utils.h index f928928f30..757366b551 100644 --- a/onnxruntime/core/codegen/passes/scheduler/schedule_utils.h +++ b/onnxruntime/core/codegen/passes/scheduler/schedule_utils.h @@ -26,6 +26,11 @@ bool InsertRootScheduleAndClosure( const tvm::Tensor& tensor, ScheduleContext& ctx); +// Check precondition for vectorize schedule +bool ShouldTryVectorization( + const tvm::Tensor& tensor, + ScheduleContext& ctx); + // Check the schedule of tensor // If it is not scheduled, try to vectorize it. // Note TryVectorization has to use with compute_root. diff --git a/onnxruntime/core/providers/nuphar/common/nuphar_settings.cc b/onnxruntime/core/providers/nuphar/common/nuphar_settings.cc index 389e4f817b..271e12f50c 100644 --- a/onnxruntime/core/providers/nuphar/common/nuphar_settings.cc +++ b/onnxruntime/core/providers/nuphar/common/nuphar_settings.cc @@ -38,7 +38,8 @@ static const std::unordered_set valid_keys = { kNupharCacheSoName, kNupharCacheModelChecksum, kNupharCacheForceNoJIT, - kNupharCodeGenTarget}; + kNupharCodeGenTarget, + kNupharParallelMinWorkloads}; void SetDefaultOptions(std::map& options) { // create two temporary strings to get rid of the odr-use issue introduced @@ -56,6 +57,20 @@ void SetDefaultOptions(std::map& options) { std::string cache_so_name_opt(kNupharCacheSoName); std::string cache_so_name_default(kNupharCacheSoName_Default); options.insert(std::make_pair(cache_so_name_opt, cache_so_name_default)); + + std::string parallel_min_workloads_opt(kNupharParallelMinWorkloads); +#if defined(USE_OPENMP) || defined(USE_MKLML) + // a rough estimate of workloads based on static dimensions for each thread, when using parallel schedule + // user may change it to 0 to turn it off, + // or use OMP_NUM_THREADS to control TVM thread pool similar to control MKL + unsigned int parallel_min_workloads_default = 64; +#else + // turn off parallel schedule by default to avoid TVM thread pool confliction with others + // this is to ensure performance when user runs multiple inference threads, with each runs as single thread + // if needed, user can override it with settings, and use TVM_NUM_THREADS to control the thread pool + unsigned int parallel_min_workloads_default = 0; +#endif + options.insert(std::make_pair(parallel_min_workloads_opt, std::to_string(parallel_min_workloads_default))); } void CreateNupharCodeGenSettings(const NupharExecutionProviderInfo& info) { diff --git a/onnxruntime/core/providers/nuphar/common/nuphar_settings.h b/onnxruntime/core/providers/nuphar/common/nuphar_settings.h index 8e836b7e84..5d2c149186 100644 --- a/onnxruntime/core/providers/nuphar/common/nuphar_settings.h +++ b/onnxruntime/core/providers/nuphar/common/nuphar_settings.h @@ -45,6 +45,9 @@ constexpr static const char* kNupharActivations_DeepCpu = "deep_cpu_activation"; // Option to control nuphar code generation target (avx / avx2 / avx512) constexpr static const char* kNupharCodeGenTarget = "nuphar_codegen_target"; +// Option to control nuphar code to run with parallel schedule +constexpr static const char* kNupharParallelMinWorkloads = "nuphar_parallel_min_workloads"; + // cache version number (MAJOR.MINOR.PATCH) following https://semver.org/ // 1. MAJOR version when you make incompatible changes that old cache files no longer work, // 2. MINOR version when you add functionality in a backwards - compatible manner, and diff --git a/onnxruntime/core/providers/nuphar/compiler/nuphar_handle.h b/onnxruntime/core/providers/nuphar/compiler/nuphar_handle.h index 84be4555ba..1b5f91990d 100644 --- a/onnxruntime/core/providers/nuphar/compiler/nuphar_handle.h +++ b/onnxruntime/core/providers/nuphar/compiler/nuphar_handle.h @@ -28,7 +28,7 @@ struct NupharCodeGenHandle : codegen::CodeGenHandle { std::shared_ptr schedule_builder; // keep // maybe add a layout tvm_codegen::WeightLayoutRegistry* layout_registry; - bool enable_per_node_parallelized; // TODO: change to config + int64_t parallel_min_workloads; bool allow_unaligned_buffers; // move to another place diff --git a/onnxruntime/core/providers/nuphar/compiler/nuphar_schedule_builder.cc b/onnxruntime/core/providers/nuphar/compiler/nuphar_schedule_builder.cc index 3f54d50a56..47e7030551 100644 --- a/onnxruntime/core/providers/nuphar/compiler/nuphar_schedule_builder.cc +++ b/onnxruntime/core/providers/nuphar/compiler/nuphar_schedule_builder.cc @@ -9,6 +9,7 @@ #include "core/providers/nuphar/common/analysis/subgraph_codegen_stats.h" #include "core/providers/nuphar/compiler/x86/x86_target_info.h" +#include "core/providers/nuphar/compiler/x86/scheduler/nuphar_scheduler.h" // TODO change name space namespace onnxruntime { @@ -37,9 +38,8 @@ static void Traverse(const tvm::Tensor& tensor, if (is_real_output) { CodeGenTargetX86* target = dynamic_cast(ctx_codegen.GetCodeGenHandle()->codegen_target); ORT_ENFORCE(target != nullptr); - int64_t natural_vector_size = target->NaturalVectorWidth(tensor->dtype.bits()); - TryVectorization(tensor, natural_vector_size, ctx_schedule); // to x86 + TryVectorizationX86(tensor, ctx_codegen, ctx_schedule); InsertRootScheduleAndClosure(tensor, ctx_schedule); } diff --git a/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/nuphar_scheduler.h b/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/nuphar_scheduler.h index a41642e7e3..96e0fbc584 100644 --- a/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/nuphar_scheduler.h +++ b/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/nuphar_scheduler.h @@ -39,5 +39,13 @@ bool InputRootScheduleWithVectorizationX86( tvm_codegen::CodeGenContext& ctx_codegen, tvm_codegen::ScheduleContext& ctx_sched); +bool TryParallelX86( + const tvm::Tensor& tensor, + int64_t to_dim, // fuse dims before to_dim for parallel schedule, 0 to fuse all but last dim + tvm_codegen::CodeGenContext& ctx_codegen, + tvm_codegen::ScheduleContext& ctx_sched); + +constexpr auto kNupharScheduleNoParallel = "nuphar_schedule_no_parallel"; + } // namespace nuphar } // namespace onnxruntime diff --git a/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/ort_type_schedule.cc b/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/ort_type_schedule.cc index d32e35796a..2cc444dc18 100644 --- a/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/ort_type_schedule.cc +++ b/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/ort_type_schedule.cc @@ -3,26 +3,99 @@ #include "core/providers/nuphar/compiler/x86/scheduler/nuphar_scheduler.h" +#include "core/codegen/passes/scheduler/schedule_utils.h" +#include "core/framework/op_kernel_info.h" +#include "core/providers/nuphar/common/nuphar_settings.h" #include "core/providers/nuphar/common/analysis/subgraph_codegen_stats.h" #include "core/providers/nuphar/compiler/nuphar_codegen_ctx.h" -#include "core/codegen/passes/scheduler/schedule_utils.h" #include "core/providers/nuphar/compiler/x86/scheduler/tensorize/intrin_gemv_ll_extern.h" #include "core/providers/nuphar/compiler/x86/scheduler/tensorize/intrin_gemv_ll_ir.h" #include "core/providers/nuphar/compiler/x86/x86_target_info.h" -#include "core/framework/op_kernel_info.h" #include +#include namespace onnxruntime { namespace nuphar { +bool TryParallelX86( + const tvm::Tensor& tensor, + int64_t to_dim, + tvm_codegen::CodeGenContext& ctx_codegen, + tvm_codegen::ScheduleContext& ctx_sched) { + auto compute_op = tensor->op.as(); + if (compute_op == nullptr) { + return false; + } + if (compute_op->attrs.count(kNupharScheduleNoParallel)) { + return false; + } + + const auto& shape = tensor->shape; + + int rank = gsl::narrow(shape.size()); + tvm::Array to_fuse_for_parallel; + int64_t rank_to_parallel = (to_dim ? to_dim : rank - 1); + for (int64_t i = 0; i < rank_to_parallel && i < gsl::narrow(compute_op->axis.size()); ++i) { + tvm::IterVar axis = compute_op->axis[i]; + auto dom = axis->dom; + if (!tvm::ir::Equal(dom->extent, shape[i])) { + // only do parallel schedule on axis not being fused or split yet + rank_to_parallel = i; + break; + } + to_fuse_for_parallel.push_back(axis); + } + + if (to_fuse_for_parallel.size() < 1) { + return false; + } + + int64_t per_thread_static_dims = 1; + for (const auto& reduce_axis : compute_op->reduce_axis) { + const int64_t* static_range = tvm::as_const_int(reduce_axis->dom->extent); + if (static_range != nullptr) { + per_thread_static_dims *= *static_range; + } + } + for (int64_t i = rank_to_parallel; i < rank; ++i) { + auto dim = tvm::as_const_int(shape[i]); + if (dim != nullptr) { + per_thread_static_dims *= *dim; + } + } + + // skip small per thread workloads, note that symbolic dims are ignored (treated as 1) + int64_t workloads_threshold = Promote(&ctx_codegen)->GetCodeGenHandle()->parallel_min_workloads; + if (workloads_threshold <= 0 || per_thread_static_dims < workloads_threshold) { + return false; + } + + tvm::IterVar parallel_axis; + if (to_fuse_for_parallel.size() > 1) { + ctx_sched.schedule[tensor->op].fuse(to_fuse_for_parallel, ¶llel_axis); + } else { + parallel_axis = to_fuse_for_parallel[0]; + } + ctx_sched.schedule[tensor->op].parallel(parallel_axis); + return true; +} + bool TryVectorizationX86( const tvm::Tensor& tensor, tvm_codegen::CodeGenContext& ctx_codegen, tvm_codegen::ScheduleContext& ctx_sched) { + if (!ShouldTryVectorization(tensor, ctx_sched)) + return false; + CodeGenTargetX86* target = dynamic_cast(ctx_codegen.GetCodeGenHandle()->codegen_target); ORT_ENFORCE(target != nullptr); int64_t natural_vector_size = target->NaturalVectorWidth(tensor->dtype.bits()); + // try to use parallel schedule when vectorizing + // note that we don't do logic-or in return value here + // to make sure vectorization is always tried + TryParallelX86(tensor, 0, ctx_codegen, ctx_sched); + return TryVectorization(tensor, natural_vector_size, ctx_sched); } @@ -176,7 +249,7 @@ static Status ConvScheduleX86(const tvm::Tensor& tensor, ctx_sched.schedule[tensor->op].reorder({b, oc_chunk, y, xo, ic_chunk, m, n, ic_block, xi, oc_block}); - if (ctx_codegen.GetCodeGenHandle()->enable_per_node_parallelized) { + if (ctx_codegen.GetCodeGenHandle()->parallel_min_workloads > 0) { tvm::Array fused_axis; fused_axis.push_back(b); fused_axis.push_back(oc_chunk); @@ -186,6 +259,7 @@ static Status ConvScheduleX86(const tvm::Tensor& tensor, ctx_sched.schedule[tensor->op].fuse(fused_axis, ¶llel_axis); ctx_sched.schedule[tensor->op].parallel(parallel_axis); } + ctx_sched.schedule[tensor->op].vectorize(oc_block); return Status::OK(); @@ -243,7 +317,7 @@ static Status MatMul_2DWeight_Schedule( ctx_sched.schedule[CC->op].unroll(ki); ctx_sched.schedule[CC->op].vectorize(yc); - if (ctx_codegen.GetCodeGenHandle()->enable_per_node_parallelized) { + if (ctx_codegen.GetCodeGenHandle()->parallel_min_workloads > 0) { // parallelize tvm::Array fused_axis; for (size_t d = 0; d < C_rank - 2; ++d) diff --git a/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/tvm_rule_schedule.cc b/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/tvm_rule_schedule.cc index 646f22d7d7..d3f923f4e6 100644 --- a/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/tvm_rule_schedule.cc +++ b/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/tvm_rule_schedule.cc @@ -22,6 +22,7 @@ bool TVM_SCHEDULER_CLASS(Extern, NupharX86TVMRule)::Evaluate( static bool ReduceVScheduleNupharX86( const tvm::Tensor& tensor, + tvm_codegen::CodeGenContext& ctx_codegen, tvm_codegen::ScheduleContext& ctx_sched) { InsertRootScheduleAndClosure(tensor, ctx_sched); @@ -55,6 +56,8 @@ static bool ReduceVScheduleNupharX86( if (shape.size() > 0) head_dim = as_const_int(shape[0]); + bool try_parallel = true; + // unroll packed reduce by checking head dim if (nullptr != head_dim) { // if head_dim is already fused, don't unroll @@ -81,8 +84,13 @@ static bool ReduceVScheduleNupharX86( ctx_sched.schedule[tensor->op].reorder(reorder_axis); ctx_sched.schedule[tensor->op].unroll(x0); + try_parallel = false; } } + + if (try_parallel) { + TryParallelX86(tensor, *fuse_dim, ctx_codegen, ctx_sched); + } } else if (compute_op->axis.size() > 0 && tvm::as_const_int(tensor->shape[0]) != nullptr) { tvm::IterVar x = compute_op->axis[0]; @@ -101,7 +109,7 @@ static bool ReduceVScheduleNupharX86( bool TVM_SCHEDULER_CLASS(Reduce, NupharX86TVMRule)::Evaluate( const tvm::Tensor& tensor, const Node*, - tvm_codegen::CodeGenContext&, + tvm_codegen::CodeGenContext& ctx_codegen, tvm_codegen::ScheduleContext& ctx_sched) { // respect topi::kCommReduce if (tensor->op->tag == topi::kCommReduce) { @@ -109,7 +117,7 @@ bool TVM_SCHEDULER_CLASS(Reduce, NupharX86TVMRule)::Evaluate( } if (tensor->op->tag == nuphar::kNupharVReduce) { - return ReduceVScheduleNupharX86(tensor, ctx_sched); + return ReduceVScheduleNupharX86(tensor, ctx_codegen, ctx_sched); } // unknown goes to InsertRootScheduleAndClosure diff --git a/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc b/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc index e816c77ed9..2bcb6f0010 100644 --- a/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc +++ b/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc @@ -7,6 +7,7 @@ #include "core/codegen/mti/mti_tvm_utils.h" #include "core/codegen/mti/tensor/pad_ops.h" #include "core/codegen/mti/tensor/reshape_ops.h" +#include "core/providers/nuphar/compiler/x86/scheduler/nuphar_scheduler.h" #include namespace onnxruntime { @@ -85,7 +86,7 @@ tvm::Tensor ReduceValueWithoutSplit(const tvm::Tensor& X, tvm::Map attrs; attrs.Set(kNupharVReduceFuseDim, tvm::Expr(fuse_dim)); - + attrs.Set(kNupharScheduleNoParallel, tvm::Expr(true)); return tvm::compute(output_shape, l_out, name + "_regular_reduce", kNupharVReduce, attrs); } diff --git a/onnxruntime/core/providers/nuphar/nuphar_execution_provider.cc b/onnxruntime/core/providers/nuphar/nuphar_execution_provider.cc index b36d526828..d244c51d30 100644 --- a/onnxruntime/core/providers/nuphar/nuphar_execution_provider.cc +++ b/onnxruntime/core/providers/nuphar/nuphar_execution_provider.cc @@ -125,8 +125,8 @@ NupharExecutionProvider::NupharExecutionProvider(const NupharExecutionProviderIn handle->shape_inference = whole_graph_shape_infer_; - // TODO: remove - handle->enable_per_node_parallelized = info.enable_per_node_parallel; + handle->parallel_min_workloads = std::stoi(settings.GetOptionValue(kNupharParallelMinWorkloads)); + // TODO: remove handle->allow_unaligned_buffers = info.allow_unaligned_buffers; // TODO remove this diff --git a/onnxruntime/core/providers/nuphar/nuphar_execution_provider.h b/onnxruntime/core/providers/nuphar/nuphar_execution_provider.h index 40c7821032..07ca11e4f0 100644 --- a/onnxruntime/core/providers/nuphar/nuphar_execution_provider.h +++ b/onnxruntime/core/providers/nuphar/nuphar_execution_provider.h @@ -31,9 +31,6 @@ constexpr const char* default_nuphar_target_str = stackvm_target_str; // Information needed to construct Nuphar execution providers. struct NupharExecutionProviderInfo { - // By default, let provider decide the target by passing in empty string. - bool enable_per_node_parallel; // TODO: remove - // this flag set TVM build_config with data_alignment=1, at the cost of performance bool allow_unaligned_buffers; @@ -43,10 +40,8 @@ struct NupharExecutionProviderInfo { std::string settings; explicit NupharExecutionProviderInfo(bool unaligned_buffers, - const std::string& str_settings = "", - bool per_node_parallel = true) - : enable_per_node_parallel(per_node_parallel), - allow_unaligned_buffers(unaligned_buffers), + const std::string& str_settings = "") + : allow_unaligned_buffers(unaligned_buffers), settings(str_settings) {} NupharExecutionProviderInfo() = default; }; diff --git a/onnxruntime/core/providers/nuphar/nuphar_provider_factory.cc b/onnxruntime/core/providers/nuphar/nuphar_provider_factory.cc index 4fdcbfa277..e5c2989e51 100644 --- a/onnxruntime/core/providers/nuphar/nuphar_provider_factory.cc +++ b/onnxruntime/core/providers/nuphar/nuphar_provider_factory.cc @@ -21,7 +21,7 @@ struct NupharExecutionProviderFactory : IExecutionProviderFactory { }; std::unique_ptr NupharExecutionProviderFactory::CreateProvider() { - NupharExecutionProviderInfo info(allow_unaligned_buffers_, settings_, /*per_node_parallel*/ true); + NupharExecutionProviderInfo info(allow_unaligned_buffers_, settings_); return onnxruntime::make_unique(info); } From 0edd4ef6ca7323659693eb92b3c9b3a97f13970d Mon Sep 17 00:00:00 2001 From: liuziyue Date: Thu, 28 Nov 2019 14:03:58 -0800 Subject: [PATCH 13/13] EmbedLayerNormalization fusion (#2452) Embed Layer Normalization Fusion --- .../core/optimizer/embed_layer_norm_fusion.cc | 270 ++++++++++++++++++ .../core/optimizer/embed_layer_norm_fusion.h | 24 ++ .../core/optimizer/graph_transformer_utils.cc | 2 + .../core/optimizer/skip_layer_norm_fusion.h | 7 +- .../test/optimizer/graph_transform_test.cc | 43 +++ .../fusion/embed_layer_norm_format1.onnx | Bin 0 -> 940 bytes .../fusion/embed_layer_norm_format2.onnx | Bin 0 -> 1060 bytes 7 files changed, 341 insertions(+), 5 deletions(-) create mode 100644 onnxruntime/core/optimizer/embed_layer_norm_fusion.cc create mode 100644 onnxruntime/core/optimizer/embed_layer_norm_fusion.h create mode 100644 onnxruntime/test/testdata/transform/fusion/embed_layer_norm_format1.onnx create mode 100644 onnxruntime/test/testdata/transform/fusion/embed_layer_norm_format2.onnx diff --git a/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc b/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc new file mode 100644 index 0000000000..c2f6136297 --- /dev/null +++ b/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc @@ -0,0 +1,270 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +#include "core/optimizer/initializer.h" +#include "core/optimizer/embed_layer_norm_fusion.h" +#include "core/graph/graph_utils.h" +#include "float.h" + +#define DEBUG_LOG(x) LOGS(logger, VERBOSE) << x + +using namespace ONNX_NAMESPACE; +using namespace onnxruntime::common; +namespace onnxruntime { + +// Add a Cast to convert Input from int64 to int32. +static NodeArg* CastToInt32(Graph& graph, NodeArg* input, ProviderType provider_type) { + const TensorShapeProto* input_shape = input->Shape(); + TypeProto input_int32; + input_int32.mutable_tensor_type()->set_elem_type(TensorProto_DataType_INT32); + auto dim0 = input_int32.mutable_tensor_type()->mutable_shape()->add_dim(); + *dim0 = input_shape->dim(0); + auto dim1 = input_int32.mutable_tensor_type()->mutable_shape()->add_dim(); + *dim1 = input_shape->dim(1); + auto& cast32 = graph.GetOrCreateNodeArg(graph.GenerateNodeArgName(input->Name() + "_Int32"), &input_int32); + + Node& node = graph.AddNode(graph.GenerateNodeName(input->Name() + "_Cast"), + "Cast", + "Cast Input from int64 to int32", + {input}, + {&cast32}, + nullptr, + kOnnxDomain); + + // Add attribute: "to" = 6 + ONNX_NAMESPACE::AttributeProto to; + to.set_name("to"); + to.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT); + to.set_i(static_cast(ONNX_NAMESPACE::TensorProto_DataType_INT32)); + node.AddAttribute("to", to); + + node.SetExecutionProviderType(provider_type); + return &cast32; +} + +static NodeArg* CheckInput(Graph& graph, NodeArg* input, ProviderType provider_type, const logging::Logger& logger) { + // Validate input shape (batch_size, sequence_length) and data type. + // Note that batch_size and sequence_length could be symbolic. + const TensorShapeProto* input_shape = input->Shape(); + if (input_shape == nullptr || input_shape->dim_size() != 2 || input->Type() == nullptr) { + DEBUG_LOG("Mask shape is unknown or not 2D, or data type unknown"); + return nullptr; + } + + auto data_type = input->TypeAsProto()->tensor_type().elem_type(); + if (data_type != ONNX_NAMESPACE::TensorProto_DataType_INT64 && + data_type != ONNX_NAMESPACE::TensorProto_DataType_INT32) { + DEBUG_LOG("Input data type is not int32 or int64"); + return nullptr; + } + + if (data_type == ONNX_NAMESPACE::TensorProto_DataType_INT64) { + return CastToInt32(graph, input, provider_type); + } + return input; +} + +/** +Embed Layer Normalization will fuse embeddings and mask processing into one node : +The embeddings before conversion: + (input_ids) --------> Gather ----------+ (segment_ids) + | | | + | v v + +--> Shape --> Expand -> Gather---->Add Gather + | ^ | | + | | v v + +---(optional graph) SkipLayerNormalization + +*/ +Status EmbedLayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const { + GraphViewer graph_viewer(graph); + const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder(); + for (auto node_index : node_topology_list) { + auto* p_layer_norm = graph.GetNode(node_index); + if (p_layer_norm == nullptr) + continue; // we removed the node as part of an earlier fusion + + Node& layer_norm_node = *p_layer_norm; + ORT_RETURN_IF_ERROR(Recurse(layer_norm_node, modified, graph_level, logger)); + if (!graph_utils::IsSupportedOptypeVersionAndDomain(layer_norm_node, "LayerNormalization", {1}, kOnnxDomain) || + !graph_utils::IsSupportedProvider(layer_norm_node, GetCompatibleExecutionProviders())) { + continue; + } + // Find Attention after SkipLayerNormalization + const Node* p_attention = graph_utils::FirstChildByType(layer_norm_node, "Attention"); + // Stop EmbedLayerNormalization fusion if Attention is not found. + if (p_attention == nullptr) { + return Status::OK(); + } + Node& attention_node = *graph.GetNode(p_attention->Index()); + if (!graph_utils::IsSupportedOptypeVersionAndDomain(attention_node, "Attention", {1}, kMSDomain) || + !graph_utils::IsSupportedProvider(attention_node, GetCompatibleExecutionProviders())) { + continue; + } + // Find ReduceSum --> Attention + std::vector edges; + if (!graph_utils::FindPath(attention_node, true, {{0, 3, "ReduceSum", {1, 11}, kOnnxDomain}}, edges, logger)) { + continue; + } + Node& reduce_sum_node = *graph.GetNode(edges[0]->GetNode().Index()); + + // Find Add --> LayerNormalization + if (!graph_utils::FindPath(layer_norm_node, true, {{0, 0, "Add", {7}, kOnnxDomain}}, edges, logger)) { + continue; + } + Node& layer_norm_add_node = *graph.GetNode(edges[0]->GetNode().Index()); + + // Traceback the SkipLayerNormalization node to find Gather --> SkipLayerNormalization + std::vector segment_embedding_path{ + {0, 1, "Gather", {1, 11}, kOnnxDomain}}; + if (!graph_utils::FindPath(layer_norm_add_node, true, segment_embedding_path, edges, logger)) { + continue; + } + Node& segment_gather_node = *graph.GetNode(edges[0]->GetNode().Index()); + if (segment_gather_node.GetOutputEdgesCount() != 1) { + continue; + } + // The first input of segment_gather_node must be 2d. + auto sg_shape = segment_gather_node.MutableInputDefs()[0]->Shape(); + if (sg_shape != nullptr && sg_shape->dim_size() != 2) { + continue; + } + + // Traceback the SkipLayerNormalization node to find Gather --> Add --> SkipLayerNormalization + std::vector word_embedding_path{ + {0, 0, "Add", {7}, kOnnxDomain}, + {0, 0, "Gather", {1, 11}, kOnnxDomain}}; + if (!graph_utils::FindPath(layer_norm_add_node, true, word_embedding_path, edges, logger)) { + continue; + } + Node& add_node = *graph.GetNode(edges[0]->GetNode().Index()); + Node& word_gather_node = *graph.GetNode(edges[1]->GetNode().Index()); + if (add_node.GetOutputEdgesCount() != 1 || word_gather_node.GetOutputEdgesCount() != 1) { + continue; + } + // The first input of word_gather_node must be 2d. + auto wg_shape = word_gather_node.MutableInputDefs()[0]->Shape(); + if (wg_shape != nullptr && wg_shape->dim_size() != 2) { + continue; + } + + // Traceback the Add node to find (Shape --> Expand -->) Gather --> Add. + // Constant folding removes Shape and Expand nodes when input does not have symbolic shape. In that + // case just look for Gather --> Add. + std::vector position_embedding_path{ + {0, 1, "Gather", {1, 11}, kOnnxDomain}}; + if (!graph_utils::FindPath(add_node, true, position_embedding_path, edges, logger)) { + continue; + } + Node& position_gather_node = *graph.GetNode(edges[0]->GetNode().Index()); + if (position_gather_node.GetOutputEdgesCount() != 1) { + continue; + } + // The first input of position_gather_node must be 2d. + auto pg_shape = position_gather_node.MutableInputDefs()[0]->Shape(); + if (pg_shape != nullptr && pg_shape->dim_size() != 2) { + continue; + } + + // Match Shape --> Expand path if needed. + std::vector position_embedding_path_symbolic{ + {0, 1, "Expand", {8}, kOnnxDomain}, + {0, 1, "Shape", {1}, kOnnxDomain}}; + Node* p_expand_node = nullptr; + Node* p_shape_node = nullptr; + if (graph_utils::FindPath(position_gather_node, true, position_embedding_path_symbolic, edges, logger)) { + if (edges[0]->GetNode().GetOutputEdgesCount() == 1 && edges[1]->GetNode().GetOutputEdgesCount() == 1) { + p_expand_node = graph.GetNode(edges[0]->GetNode().Index()); + p_shape_node = graph.GetNode(edges[1]->GetNode().Index()); + } + } + + // Get input "input_ids" from node. + NodeArg* input_ids = CheckInput(graph, word_gather_node.MutableInputDefs()[1], layer_norm_node.GetExecutionProviderType(), logger); + if (input_ids == nullptr) { + DEBUG_LOG("Input id is not valid. "); + continue; + } + + // Get input "segment_ids" from node. + NodeArg* segment_ids = CheckInput(graph, segment_gather_node.MutableInputDefs()[1], layer_norm_node.GetExecutionProviderType(), logger); + if (segment_ids == nullptr) { + DEBUG_LOG("Segment id is not valid. "); + continue; + } + + // Get input "mask" from "ReduceSum" node. + NodeArg* mask = CheckInput(graph, reduce_sum_node.MutableInputDefs()[0], layer_norm_node.GetExecutionProviderType(), logger); + if (mask == nullptr) { + DEBUG_LOG("Mask is not valid. "); + continue; + } + + const std::vector embed_layer_norm_input_defs{ + input_ids, + segment_ids, + mask, + word_gather_node.MutableInputDefs()[0], + position_gather_node.MutableInputDefs()[0], + segment_gather_node.MutableInputDefs()[0], + layer_norm_node.MutableInputDefs()[1], + layer_norm_node.MutableInputDefs()[2]}; + Node& embed_layer_norm_node = graph.AddNode(graph.GenerateNodeName("EmbedLayerNormalization"), + "EmbedLayerNormalization", + "fused EmbedLayerNorm subgraphs ", + embed_layer_norm_input_defs, + {layer_norm_node.MutableOutputDefs()[0], reduce_sum_node.MutableOutputDefs()[0]}, + {}, kMSDomain); + + // Assign provider to this new node. Provider should be same as the provider for old node. + embed_layer_norm_node.SetExecutionProviderType(layer_norm_node.GetExecutionProviderType()); + + // move input edges to gather (first in list) across to the embed_layer_norm_node. + // move output definitions and output edges to embed_layer_norm_node. + // remove all the other nodes. + std::vector nodes_to_remove; + if (p_shape_node != nullptr && p_expand_node != nullptr) { + // Match Shape --> Gather --> Unsqueeze --> ConstantOfShape --> NonZero --> Transpose --> Squeeze --> Cast --> Unsqueeze --> Expand + if (p_expand_node != nullptr) { + Node& expand_node = *graph.GetNode(p_expand_node->Index()); + std::vector expand_parent_path{ + {0, 0, "Unsqueeze", {1, 11}, kOnnxDomain}, + {0, 0, "Cast", {9}, kOnnxDomain}, + {0, 0, "Squeeze", {1}, kOnnxDomain}, + {0, 0, "Transpose", {1}, kOnnxDomain}, + {0, 0, "NonZero", {9}, kOnnxDomain}, + {0, 0, "ConstantOfShape", {9}, kOnnxDomain}, + {0, 0, "Unsqueeze", {1, 11}, kOnnxDomain}, + {0, 0, "Gather", {1, 11}, kOnnxDomain}, + {0, 0, "Shape", {1}, kOnnxDomain}, + }; + if (graph_utils::FindPath(expand_node, true, expand_parent_path, edges, logger)) { + for (size_t i = 0; i < edges.size(); i++) { + if (edges[i]->GetNode().GetOutputEdgesCount() != 1) { + nodes_to_remove.clear(); + break; + } + nodes_to_remove.push_back(edges[i]->GetNode().Index()); + } + } + } + nodes_to_remove.push_back(p_shape_node->Index()); + nodes_to_remove.push_back(p_expand_node->Index()); + } + nodes_to_remove.push_back(word_gather_node.Index()); + nodes_to_remove.push_back(position_gather_node.Index()); + nodes_to_remove.push_back(segment_gather_node.Index()); + nodes_to_remove.push_back(add_node.Index()); + nodes_to_remove.push_back(reduce_sum_node.Index()); + nodes_to_remove.push_back(layer_norm_add_node.Index()); + nodes_to_remove.push_back(layer_norm_node.Index()); + + for (const auto& index : nodes_to_remove) { + Node* node = graph.GetNode(index); + graph_utils::RemoveNodeOutputEdges(graph, *node); + graph.RemoveNode(node->Index()); + } + modified = true; + } + return Status::OK(); +} +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/optimizer/embed_layer_norm_fusion.h b/onnxruntime/core/optimizer/embed_layer_norm_fusion.h new file mode 100644 index 0000000000..6814bf4bea --- /dev/null +++ b/onnxruntime/core/optimizer/embed_layer_norm_fusion.h @@ -0,0 +1,24 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/optimizer/graph_transformer.h" + +namespace onnxruntime { + +/** +@Class EmbedLayerNormFusion + +Rewrite graph fusing embeddings and mask processing into one node. + +*/ +class EmbedLayerNormFusion : public GraphTransformer { + public: + EmbedLayerNormFusion(const std::unordered_set& compatible_execution_providers = {}) noexcept + : GraphTransformer("EmbedLayerNormFusion", compatible_execution_providers) {} + + Status ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const override; +}; + +} // namespace onnxruntime diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc index e2ae44fa18..cd3712f633 100644 --- a/onnxruntime/core/optimizer/graph_transformer_utils.cc +++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc @@ -23,6 +23,7 @@ #include "core/optimizer/gelu_approximation.h" #include "core/optimizer/layer_norm_fusion.h" #include "core/optimizer/skip_layer_norm_fusion.h" +#include "core/optimizer/embed_layer_norm_fusion.h" #include "core/optimizer/reshape_fusion.h" #include "core/optimizer/attention_fusion.h" #include "core/mlas/inc/mlas.h" @@ -128,6 +129,7 @@ std::vector> GenerateTransformers(TransformerL transformers.emplace_back(onnxruntime::make_unique(cpu_cuda_execution_providers)); transformers.emplace_back(onnxruntime::make_unique(cpu_cuda_execution_providers)); transformers.emplace_back(onnxruntime::make_unique(cpu_cuda_execution_providers)); + transformers.emplace_back(onnxruntime::make_unique(cpu_cuda_execution_providers)); transformers.emplace_back(onnxruntime::make_unique(cpu_cuda_execution_providers)); transformers.emplace_back(onnxruntime::make_unique(cpu_cuda_execution_providers)); diff --git a/onnxruntime/core/optimizer/skip_layer_norm_fusion.h b/onnxruntime/core/optimizer/skip_layer_norm_fusion.h index 99eb0b0ed1..7b634f51db 100644 --- a/onnxruntime/core/optimizer/skip_layer_norm_fusion.h +++ b/onnxruntime/core/optimizer/skip_layer_norm_fusion.h @@ -8,12 +8,9 @@ namespace onnxruntime { /** -@Class LayerNormFusion +@Class SkipLayerNormFusion -Rewrite graph fusing Layer Normalization subgraph to a single LayerNormalization node. - -The formula corresponding to LayerNorm activation subgraph: -(x - mean(x, axis)) / sqrt(var(x, axis)) * scale + bias, where x is the input. +Rewrite graph fusing Add + Layer Normalization subgraph to a single SkipLayerNormalization node. */ class SkipLayerNormFusion : public GraphTransformer { diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc index 854365515c..00220740a6 100644 --- a/onnxruntime/test/optimizer/graph_transform_test.cc +++ b/onnxruntime/test/optimizer/graph_transform_test.cc @@ -19,6 +19,7 @@ #include "core/optimizer/gelu_approximation.h" #include "core/optimizer/layer_norm_fusion.h" #include "core/optimizer/skip_layer_norm_fusion.h" +#include "core/optimizer/embed_layer_norm_fusion.h" #include "core/optimizer/graph_transformer.h" #include "core/optimizer/graph_transformer_mgr.h" #include "core/optimizer/identity_elimination.h" @@ -1253,6 +1254,48 @@ TEST(GraphTransformationTests, SkipLayerNormFusionTest) { TestSkipLayerNormFusion(MODEL_FOLDER "fusion/skip_layer_norm_format2.onnx"); TestSkipLayerNormFusion(MODEL_FOLDER "fusion/skip_layer_norm_format3.onnx"); } + +TEST(GraphTransformationTests, EmbedLayerNormFusionFormat1) { + auto model_uri = MODEL_FOLDER "fusion/embed_layer_norm_format1.onnx"; + std::shared_ptr p_model; + ASSERT_TRUE(Model::Load(model_uri, p_model, nullptr, DefaultLoggingManager().DefaultLogger()).IsOK()); + Graph& graph = p_model->MainGraph(); + + onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; + graph_transformation_mgr.Register(onnxruntime::make_unique(), TransformerLevel::Level2); + auto ret = graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, DefaultLoggingManager().DefaultLogger()); + ASSERT_TRUE(ret.IsOK()); + + std::map op_to_count = CountOpsInGraph(graph); + ASSERT_TRUE(op_to_count["Gather"] == 0); + ASSERT_TRUE(op_to_count["Add"] == 0); + ASSERT_TRUE(op_to_count["ReduceSum"] == 0); + ASSERT_TRUE(op_to_count["Attention"] == 1); + ASSERT_TRUE(op_to_count["SkipLayerNormalization"] == 0); + ASSERT_TRUE(op_to_count["EmbedLayerNormalization"] == 1); +} + +TEST(GraphTransformationTests, EmbedLayerNormFusionFormat2) { + auto model_uri = MODEL_FOLDER "fusion/embed_layer_norm_format2.onnx"; + std::shared_ptr p_model; + ASSERT_TRUE(Model::Load(model_uri, p_model, nullptr, DefaultLoggingManager().DefaultLogger()).IsOK()); + Graph& graph = p_model->MainGraph(); + + onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; + graph_transformation_mgr.Register(onnxruntime::make_unique(), TransformerLevel::Level2); + auto ret = graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, DefaultLoggingManager().DefaultLogger()); + ASSERT_TRUE(ret.IsOK()); + + std::map op_to_count = CountOpsInGraph(graph); + ASSERT_TRUE(op_to_count["Shape"] == 0); + ASSERT_TRUE(op_to_count["Expand"] == 0); + ASSERT_TRUE(op_to_count["Gather"] == 0); + ASSERT_TRUE(op_to_count["Add"] == 0); + ASSERT_TRUE(op_to_count["ReduceSum"] == 0); + ASSERT_TRUE(op_to_count["Attention"] == 1); + ASSERT_TRUE(op_to_count["SkipLayerNormalization"] == 0); + ASSERT_TRUE(op_to_count["EmbedLayerNormalization"] == 1); +} #endif } // namespace test diff --git a/onnxruntime/test/testdata/transform/fusion/embed_layer_norm_format1.onnx b/onnxruntime/test/testdata/transform/fusion/embed_layer_norm_format1.onnx new file mode 100644 index 0000000000000000000000000000000000000000..413c52bdb280d5976c5b3624353ce017a2129e4d GIT binary patch literal 940 zcmd;J6%x)bD$y+|O3W)x%P-1JEXmBzE4J!m=ThckD^JZ#&nV$yj1b~WPb|quEi#PH zFD)r3Em2}~2lKHiGD25m#D%6E#)qjC;!aFSft$_jn395Qa3sujV|0V{xsWx&Oob@} zn+mf+32dq_SOZLqi?uj8F(;LaB`GtpSco$x4{E-YkWXS|YLOo(IC3(pz(JyfMT|>@ zizPR)I9o`tC^e-tIW@kxG#BPX&LFT*aA_`AJ0lmuY%USUl9JRsP$OgwaN6*)uyQtl_I=2X=`{?^${KURM!& z-@*d><;PO(?=;@GHI4|iZ(w4zKX`SDT}Xb5trM2Gh!O^80xlj7HX#l!CJrVbW(ElY n(LK_d^WgArj&mNK@OMrdaQ+QsO^B)|m#>ir&` literal 0 HcmV?d00001 diff --git a/onnxruntime/test/testdata/transform/fusion/embed_layer_norm_format2.onnx b/onnxruntime/test/testdata/transform/fusion/embed_layer_norm_format2.onnx new file mode 100644 index 0000000000000000000000000000000000000000..58e4ee64e2f3f708d1770471c5757b7a6913cf23 GIT binary patch literal 1060 zcmd;J6%x)bD$y+|O3W)x%P-1JEXmBzE4FIm;8NydD^JZ#&nV$yj1b~WPb|quEi#PH zFD)r3Em2}~2lKfkK$1Mg8Hoj{Fe%nx5Rc1*i#N5RATci`9%>4TDj}F8%sf}HS}uLK zrKk#E4lsh*0&{>77n;*xe3(ih?!=T7xVxDhQ&Nz976~)h7~NofE@X``Q(?-$royaH z0-Nd!)&LXZVl7Ti%t_^9NyYU6faBwSO5#v(fV#!S` z&K43ZN=+$EPK_@v&4oFUGYBjcT$;<(&d7x@n@hy8q$D*D6oc`F*=4YR7eW;UIfx6> zR3SDq9MsdOAw~k!NCa< zhFSthLg4V!>SE+z=HLXHnbVxP->!SHrOo7t&32u4Jnd!p!tF)(UbFMQR%>6+>uA61 zw66V(BYgH!S%&ruy1BMS|BdW;zb4u9KYeU7rI6F!OV-=&;RSPhx9D2?Idv}f-?Ap! z#O`sm&)w%`uVSBNx7eB4?w+B(z3UHmdnc^nt7QjviOThPEcTr{j@Vf(UvIl;g|z*( z_%C*sW~te;hlkrwWjbh=8>?^cge5Mbguxk%i-&_vh=YrXg9(V4L4v@vi6$rpO%rJ1 sNfKPp$OG$OL>QAL#f6&fxVSht*o9a?)?_JTO9cp{8G(uzotOl;0Ei()4FCWD literal 0 HcmV?d00001