Merge branch 'master' into windowsai

This commit is contained in:
Brian Martin 2019-11-29 07:50:17 -08:00
commit 5adab88eed
76 changed files with 1677 additions and 338 deletions

View file

@ -347,6 +347,11 @@ if (onnxruntime_USE_ACL)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES arm_compute acl arm_compute_graph arm_compute_core)
endif()
# MKLML
if (onnxruntime_USE_MKLDNN OR onnxruntime_USE_MKLML)
include(mkldnn)
endif()
# TVM
if (onnxruntime_USE_TVM)
if (onnxruntime_USE_CUDA)
@ -356,6 +361,19 @@ if (onnxruntime_USE_TVM)
set(USE_LLVM ON)
add_definitions(-DUSE_TVM_WITH_LLVM)
endif()
if (onnxruntime_USE_OPENMP)
set(USE_OPENMP "gnu")
endif()
if (onnxruntime_USE_MKLML)
set(USE_OPENMP "intel")
# make sure MKLML in ORT is used by TVM
if (WIN32)
set(OMP_LIBRARY ${MKLML_LIB_DIR}/${IOMP5MD_IMPORT_LIB})
else()
set(OMP_LIBRARY ${MKLML_LIB_DIR}/${IOMP5MD_SHARED_LIB})
endif()
endif()
add_subdirectory(${PROJECT_SOURCE_DIR}/external/tvm EXCLUDE_FROM_ALL)
set_target_properties(tvm PROPERTIES FOLDER "External/tvm")
set_target_properties(tvm_topi PROPERTIES FOLDER "External/tvm")
@ -505,10 +523,6 @@ include_directories(
${REPO_ROOT}/include/onnxruntime/core/session
)
if (onnxruntime_USE_MKLDNN OR onnxruntime_USE_MKLML)
include(mkldnn)
endif()
if(onnxruntime_USE_GEMMLOWP)
add_definitions(-DUSE_GEMMLOWP=1)
endif()

2
cmake/external/tvm vendored

@ -1 +1 @@
Subproject commit 9b3a424a91d6003db1993cdd7121e46696f220e8
Subproject commit c6e3efcdb09aeda961a6badf76093ceac69db64d

View file

@ -17,6 +17,12 @@ You can use the Nuphar execution provider via the python wheel from the ONNX Run
## Performance and Accuracy Testing
You can test your ONNX model's performance with [onnxruntime_perf_test](../../onnxruntime/test/perftest/README.md), or test accuracy with [onnx_test_runner](../../onnxruntime/test/onnx/README.txt). To run these tools with the Nuphar execution provider, please pass `-e nuphar` in command line options.
Please note that Nuphar uses TVM thread pool and parallel schedule for multi-thread inference performance. When building with OpenMP or MKLML, TVM thread pool would use gomp or iomp as its implementation; otherwise, TVM creates its own thread pool. Because of this, the current default parallel schedule policy is:
- Default to on for USE_OPENMP or USE_MKLML. User can use OMP_NUM_THREADS/MKL_NUM_THREADS to control TVM thread pool, as well as TVM_NUM_THREADS
- Default to off for none of above. User can use TVM_NUM_THREADS to control TVM thread pool.
This choice is to ensure to get ideal performance with the different build options. When build with USE_OPENMP or USE_MKLML, users would have to avoid thread confliction from OpenMP or MKL with their inference invocations anyway, so parallel schedule is enable to leverage existing thread pool. When not building with gomp or iomp, TVM thread pool is turned off to avoid confliction with user threads. If needed, user can set env or settings with [NUPHAR_PARALLEL_MIN_WORKLOADS](../../onnxruntime/core/providers/nuphar/common/nuphar_settings.cc#L61) to 0 to disable parallel schedule, or to some non-zero value to enable parallel schedule. The non-zero value indicates the minimal number of elements being computed per thread when parallel schedule would be turned on.
## Model Conversion and Quantization
You may use Python script [model_editor.py](../../onnxruntime/core/providers/nuphar/scripts/model_editor.py) to turn LSTM/GRU/RNN ops to Scan ops for a given model, and then use [model_quantizer.py](../../onnxruntime/core/providers/nuphar/scripts/model_quantizer.py) to quantize MatMul ops into MatMulInteger ops.

View file

@ -12,9 +12,8 @@ enum class TransformerLevel : int {
Level1,
Level2,
Level3,
// Convenience enum to always get the max available value.
// This way when we add more levels code which iterates over this enum does not need to change.
MaxTransformerLevel
// The max level should always be same as the last level.
MaxLevel = Level3
};
} // namespace onnxruntime

View file

@ -46,6 +46,11 @@ class ThreadPool {
*/
void ParallelFor(int32_t total, std::function<void(int32_t)> fn);
/*
Schedule work in the interval [0, total), with calls split into (num_batches) batches.
*/
void BatchParallelFor(int32_t total, std::function<void(int32_t)> fn, int32_t num_batches = 0);
/*
Schedule work in the interval [first, last].
*/
@ -54,6 +59,43 @@ class ThreadPool {
// This is not supported until the latest Eigen
// void SetStealPartitions(const std::vector<std::pair<unsigned, unsigned>>& partitions);
/**
Tries to call the given function in parallel, with calls split into (num_batches) batches.
**/
template <typename F>
inline static void TryBatchParallelFor(concurrency::ThreadPool* tp, int32_t total, F&& fn, int32_t num_batches = 0) {
if (tp != nullptr) {
if (num_batches <= 0) {
num_batches = tp->NumThreads() + 1;
}
tp->BatchParallelFor(total, std::forward<F>(fn), num_batches);
} else {
#ifdef USE_OPENMP
#pragma omp parallel for
#endif
for (int32_t i = 0; i < total; ++i) {
fn(i);
}
}
}
/**
Tries to call the given function in parallel.
**/
template <typename F>
inline static void TryParallelFor(concurrency::ThreadPool* tp, int32_t total, F&& fn) {
if (tp != nullptr) {
tp->ParallelFor(total, std::forward<F>(fn));
} else {
#ifdef USE_OPENMP
#pragma omp parallel for
#endif
for (int32_t i = 0; i < total; ++i) {
fn(i);
}
}
}
int NumThreads() const;
int CurrentThreadId() const;

View file

@ -0,0 +1,76 @@
#include "bias_gelu_fusion.h"
#include "core/util/math_cpuonly.h"
#include "core/platform/threadpool.h"
#include "core/mlas/inc/mlas.h"
namespace onnxruntime {
namespace contrib {
ONNX_OPERATOR_KERNEL_EX(
BiasGelu,
kMSDomain,
1,
kCpuExecutionProvider,
KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
BiasGelu<float>);
template <typename T>
Status BiasGelu<T>::Compute(OpKernelContext* ctx) const {
const Tensor* X = ctx->Input<Tensor>(0);
const auto input_dims = X->Shape().GetDims();
if (input_dims.size() < 1) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"Shape of Input 0 is expected to have at least 1 dimension, got ", input_dims.size());
}
const Tensor* B = ctx->Input<Tensor>(1);
const auto bias_dims = B->Shape().GetDims();
if (bias_dims.size() != 1) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"Input 1 is expected to have 1 dimension, got ", bias_dims.size());
}
int64_t bias_len = bias_dims[0];
if (bias_len != input_dims[input_dims.size() - 1]) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"dimension 0 of Input 1 should have same length as the last dimension of input 0");
}
Tensor* Y = ctx->Output(0, X->Shape());
AllocatorPtr alloc;
ORT_RETURN_IF_ERROR(ctx->GetTempSpaceAllocator(&alloc));
BufferUniquePtr temp_data_buf_ptr = BufferUniquePtr(alloc->Alloc(sizeof(T) * X->Shape().Size()), BufferDeleter(alloc));
T* tmp_data = static_cast<T*>(temp_data_buf_ptr.get());
const T* X_data = X->template Data<T>();
const T* B_data = B->template Data<T>();
T* Y_data = Y->template MutableData<T>();
int64_t task_count = X->Shape().Size() / bias_len;
concurrency::ThreadPool::TryBatchParallelFor(ctx->GetOperatorThreadPool(),
static_cast<int32_t>(task_count),
[&](int32_t task_idx) {
const T* p_input = X_data + task_idx * bias_len;
T* p_output = Y_data + task_idx * bias_len;
T* p_output_tmp = tmp_data + task_idx * bias_len;
for (int64_t h = 0; h < bias_len; h++) {
T value = p_input[h] + B_data[h];
p_output[h] = value * static_cast<T>(M_SQRT1_2);
p_output_tmp[h] = value * 0.5f;
}
MlasComputeErf(p_output, p_output, bias_len);
for (int64_t h = 0; h < bias_len; h++) {
p_output[h] = p_output_tmp[h] * (p_output[h] + 1.0f);
}
});
return Status::OK();
}
} // namespace contrib
} // namespace onnxruntime

View file

@ -0,0 +1,20 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/common/common.h"
#include "core/framework/op_kernel.h"
namespace onnxruntime {
namespace contrib {
template <typename T>
class BiasGelu : public OpKernel {
public:
explicit BiasGelu(const OpKernelInfo& info) : OpKernel(info) {}
Status Compute(OpKernelContext* context) const override;
};
} // namespace contrib
} // namespace onnxruntime

View file

@ -4,6 +4,9 @@
#include "embed_layer_norm.h"
#include "embed_layer_norm_helper.h"
#include "core/util/math_cpuonly.h"
#include "core/platform/threadpool.h"
#include <atomic>
namespace onnxruntime {
namespace contrib {
@ -60,40 +63,63 @@ Status EmbedLayerNorm<T>::Compute(OpKernelContext* context) const {
int position_embedding_length = static_cast<int>(position_embedding->Shape()[0]);
int segment_embedding_length = static_cast<int>(segment_embedding->Shape()[0]);
ConstEigenArrayMap<T> word_embedding_arr(word_embedding->template Data<T>(), hidden_size, word_embedding_length);
ConstEigenArrayMap<T> position_embedding_arr(position_embedding->template Data<T>(), hidden_size, position_embedding_length);
ConstEigenArrayMap<T> segment_embedding_arr(segment_embedding->template Data<T>(), hidden_size, segment_embedding_length);
ConstEigenVectorMap<T> gamma_vector(gamma->template Data<T>(), hidden_size);
ConstEigenVectorMap<T> beta_vector(beta->template Data<T>(), hidden_size);
EigenArrayMap<T> output_arr(output->template MutableData<T>(), hidden_size, batch_size * sequence_length);
auto input_ids_data = input_ids->template Data<int>();
auto segment_ids_data = segment_ids->template Data<int>();
auto word_embedding_data = word_embedding->template Data<T>();
auto position_embedding_data = position_embedding->template Data<T>();
auto segment_embedding_data = segment_embedding->template Data<T>();
auto gamma_data = gamma->template Data<T>();
auto beta_data = beta->template Data<T>();
auto output_data = output->template MutableData<T>();
// Calculate output
{
size_t index = 0;
for (int b = 0; b < batch_size; b++) {
for (int s = 0; s < sequence_length; s++) {
int word_col_index = input_ids->template Data<int>()[index];
if (word_col_index < 0 || word_col_index >= word_embedding_length) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "word_col_index out of range");
}
int position_col_index = s;
if (position_col_index >= position_embedding_length) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "position_col_index out of range");
}
int segment_col_index = segment_ids->template Data<int>()[index];
if (segment_col_index < 0 || segment_col_index >= segment_embedding_length) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "segment_col_index out of range");
}
std::atomic_bool failed{false};
output_arr.col(index) = word_embedding_arr.col(word_col_index) +
position_embedding_arr.col(position_col_index) +
segment_embedding_arr.col(segment_col_index);
output_arr.col(index) -= output_arr.col(index).mean();
output_arr.col(index) /= static_cast<T>(sqrt(output_arr.col(index).pow(2).mean() + 1.0e-13));
output_arr.col(index) *= gamma_vector.array();
output_arr.col(index) += beta_vector.array();
index++;
int n = batch_size * sequence_length;
concurrency::ThreadPool::TryBatchParallelFor(context->GetOperatorThreadPool(), n, [=, &failed](int index) {
int word_col_index = input_ids_data[index];
if (word_col_index < 0 || word_col_index >= word_embedding_length) {
failed.store(true, std::memory_order_release);
return;
}
int position_col_index = index % sequence_length;
if (position_col_index >= position_embedding_length) {
failed.store(true, std::memory_order_release);
return;
}
int segment_col_index = segment_ids_data[index];
if (segment_col_index < 0 || segment_col_index >= segment_embedding_length) {
failed.store(true, std::memory_order_release);
return;
}
T* y = output_data + index * hidden_size;
const T* input_word_embedding = word_embedding_data + word_col_index * hidden_size;
const T* input_position_embedding = position_embedding_data + position_col_index * hidden_size;
const T* input_segment_embedding = segment_embedding_data + segment_col_index * hidden_size;
T sum = static_cast<T>(0);
for (int i = 0; i < hidden_size; i++) {
T subtotal = input_word_embedding[i] + input_position_embedding[i] + input_segment_embedding[i];
y[i] = subtotal;
sum += subtotal;
}
T mean = sum / hidden_size;
sum = 0;
for (int i = 0; i < hidden_size; i++) {
T a = y[i] - mean;
y[i] = a;
sum += a * a;
}
T e = sqrt(sum / hidden_size + static_cast<T>(1.0e-13));
for (int i = 0; i < hidden_size; i++) {
y[i] = y[i] / e * gamma_data[i] + beta_data[i];
}
});
if (failed.load(std::memory_order_acquire)) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "input index out of range");
}
}

View file

@ -43,17 +43,6 @@ namespace contrib {
ADD_TYPED_CROPANDRESIZE_OP(float);
template <typename T>
static void TryParallelFor(concurrency::ThreadPool* tp, int32_t total, T&& fn) {
if (tp != nullptr)
tp->ParallelFor(total, fn);
else {
for (int32_t i = 0; i != total; ++i) {
fn(i);
}
}
}
template <typename T>
void CropAndResizeForward(const TensorShape& output_shape,
const T* bottom_data,
@ -71,9 +60,7 @@ void CropAndResizeForward(const TensorShape& output_shape,
int64_t pooled_height = output_shape[2];
int64_t pooled_width = output_shape[3];
// TODO: This should do blocks of work based on the number of threads in the threadpool with each block
// being n_rois / num_threads
std::function<void(int32_t)> work_object = [&](int32_t n) {
ThreadPool::TryBatchParallelFor(ttp, static_cast<int32_t>(n_rois), [&](int32_t n) {
int64_t index_n = n * channels * pooled_width * pooled_height;
const T* offset_bottom_rois = bottom_rois + n * num_roi_cols;
@ -182,9 +169,7 @@ void CropAndResizeForward(const TensorShape& output_shape,
}
} // for pw
} // for ph
}; // for n
TryParallelFor(ttp, static_cast<int32_t>(n_rois), work_object);
}); // for n
}
template <typename T>

View file

@ -1,11 +1,13 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/framework/tensor.h"
#include "core/util/math_cpuonly.h"
#include "core/providers/common.h"
#include "layer_norm.h"
#include "core/framework/tensor.h"
#include "core/platform/threadpool.h"
#include "core/providers/common.h"
#include "core/util/math_cpuonly.h"
namespace onnxruntime {
namespace contrib {
@ -31,19 +33,22 @@ LayerNorm<T>::LayerNorm(const OpKernelInfo& op_kernel_info)
}
template <typename T>
Status LayerNorm<T>::Compute(OpKernelContext* p_op_kernel_context) const {
Status LayerNorm<T>::Compute(OpKernelContext* p_ctx) const {
// Inputs
const Tensor* X = p_op_kernel_context->Input<Tensor>(0);
const Tensor* scale = p_op_kernel_context->Input<Tensor>(1);
const Tensor* bias = p_op_kernel_context->Input<Tensor>(2);
const Tensor* X = p_ctx->Input<Tensor>(0);
const Tensor* scale = p_ctx->Input<Tensor>(1);
const Tensor* bias = p_ctx->Input<Tensor>(2);
auto X_data = X->template Data<T>();
auto scale_data = scale->template Data<T>();
auto bias_data = bias->template Data<T>();
const TensorShape& x_shape = X->Shape();
const int64_t axis = HandleNegativeAxis(axis_, x_shape.NumDimensions());
auto N = x_shape.SizeToDimension(axis);
auto M = x_shape.SizeFromDimension(axis);
auto norm_count = x_shape.SizeToDimension(axis);
auto norm_size = x_shape.SizeFromDimension(axis);
Tensor* Y = p_ctx->Output(0, x_shape);
auto Y_data = Y->template MutableData<T>();
std::vector<int64_t> mean_inv_std_var_dim;
mean_inv_std_var_dim.reserve(x_shape.NumDimensions());
@ -56,16 +61,16 @@ Status LayerNorm<T>::Compute(OpKernelContext* p_op_kernel_context) const {
}
AllocatorPtr alloc;
ORT_RETURN_IF_ERROR(p_op_kernel_context->GetTempSpaceAllocator(&alloc));
ORT_RETURN_IF_ERROR(p_ctx->GetTempSpaceAllocator(&alloc));
T* mean_data = nullptr;
BufferUniquePtr mean_data_buf_ptr;
Tensor* mean = p_op_kernel_context->Output(1, TensorShape(mean_inv_std_var_dim));
Tensor* mean = p_ctx->Output(1, TensorShape(mean_inv_std_var_dim));
if (mean != nullptr) {
mean_data = mean->template MutableData<T>();
} else {
auto mean_data_buf = alloc->Alloc(sizeof(T) * N);
auto mean_data_buf = alloc->Alloc(sizeof(T) * norm_count);
mean_data_buf_ptr = BufferUniquePtr(mean_data_buf, BufferDeleter(alloc));
mean_data = static_cast<T*>(mean_data_buf_ptr.get());
}
@ -73,38 +78,39 @@ Status LayerNorm<T>::Compute(OpKernelContext* p_op_kernel_context) const {
T* inv_std_var_data = nullptr;
BufferUniquePtr inv_std_var_data_buf_ptr;
Tensor* inv_std_var = p_op_kernel_context->Output(2, TensorShape(mean_inv_std_var_dim));
Tensor* inv_std_var = p_ctx->Output(2, TensorShape(mean_inv_std_var_dim));
if (inv_std_var != nullptr) {
inv_std_var_data = inv_std_var->template MutableData<T>();
} else {
auto inv_std_var_data_buf = alloc->Alloc(sizeof(T) * N);
auto inv_std_var_data_buf = alloc->Alloc(sizeof(T) * norm_count);
inv_std_var_data_buf_ptr = BufferUniquePtr(inv_std_var_data_buf, BufferDeleter(alloc));
inv_std_var_data = static_cast<T*>(inv_std_var_data_buf_ptr.get());
}
std::memset(mean_data, 0, sizeof(T) * N);
std::memset(inv_std_var_data, 0, sizeof(T) * N);
concurrency::ThreadPool::TryBatchParallelFor(p_ctx->GetOperatorThreadPool(),
static_cast<int32_t>(norm_count),
[&](int32_t task_idx) {
const T* p_input = X_data + task_idx * norm_size;
T* p_output = Y_data + task_idx * norm_size;
ConstEigenArrayMap<T> X_arr(X_data, M, N);
for (int i = 0; i < N; ++i) {
mean_data[i] = X_arr.col(i).mean();
inv_std_var_data[i] = X_arr.col(i).square().mean() - mean_data[i] * mean_data[i];
}
T mean = 0;
T mean_square = 0;
// Compute Y = ((x - mean) * (inv_var) * scale + bias
Tensor* Y = p_op_kernel_context->Output(0, x_shape);
auto Y_data = Y->template MutableData<T>();
EigenArrayMap<T> Y_arr(Y_data, M, N);
for (int64_t h = 0; h < norm_size; h++) {
mean += p_input[h];
mean_square += p_input[h] * p_input[h];
}
ConstEigenVectorArrayMap<T> mean_arr(mean_data, N);
EigenVectorArrayMap<T> inv_std_var_arr(inv_std_var_data, N);
inv_std_var_arr = (inv_std_var_arr + epsilon_).sqrt().inverse();
mean = mean / norm_size;
mean_square = sqrt(mean_square / norm_size - mean * mean + epsilon_);
Y_arr = (X_arr.rowwise() - mean_arr.transpose()).rowwise() * inv_std_var_arr.transpose();
for (int64_t h = 0; h < norm_size; h++) {
p_output[h] = (p_input[h] - mean) / mean_square * scale_data[h] + bias_data[h];
}
ConstEigenVectorArrayMap<T> scale_arr(scale_data, M);
ConstEigenVectorArrayMap<T> bias_arr(bias_data, M);
Y_arr = (Y_arr.colwise() * scale_arr).colwise() + bias_arr;
mean_data[task_idx] = mean;
inv_std_var_data[task_idx] = mean_square;
});
return Status::OK();
}

View file

@ -0,0 +1,129 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/framework/tensor.h"
#include "core/util/math_cpuonly.h"
#include "core/providers/common.h"
#include "core/platform/threadpool.h"
#include "skip_layer_norm.h"
namespace onnxruntime {
namespace contrib {
#define REGISTER_KERNEL_TYPED(T) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
SkipLayerNormalization, \
kMSDomain, \
1, \
T, \
kCpuExecutionProvider, \
KernelDefBuilder() \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
SkipLayerNorm<T>);
REGISTER_KERNEL_TYPED(float)
REGISTER_KERNEL_TYPED(double)
template <typename T>
SkipLayerNorm<T>::SkipLayerNorm(const OpKernelInfo& op_kernel_info)
: OpKernel(op_kernel_info) {
}
template <typename T>
Status SkipLayerNorm<T>::Compute(OpKernelContext* p_ctx) const {
const Tensor* input = p_ctx->Input<Tensor>(0);
const Tensor* skip = p_ctx->Input<Tensor>(1);
const Tensor* gamma = p_ctx->Input<Tensor>(2);
const Tensor* beta = p_ctx->Input<Tensor>(3);
const Tensor* bias = p_ctx->Input<Tensor>(4);
Tensor* output = p_ctx->Output(0, input->Shape());
const auto input_dims = input->Shape().GetDims();
if (input_dims.size() != 3) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"input is expected to have 3 dimensions, got ", input_dims.size());
}
if (input->Shape() != skip->Shape()) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"skip is expected to have same shape as input");
}
const auto gamma_dims = gamma->Shape().GetDims();
if (gamma_dims.size() != 1) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"gamma is expected to have 1 dimension, got ", gamma_dims.size());
}
if (gamma_dims[0] != input_dims[2]) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"Last dimension of gamma and input does not match");
}
const auto beta_dims = beta->Shape().GetDims();
if (beta_dims.size() != 1) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"beta is expected to have 1 dimension, got ", beta_dims.size());
}
if (beta_dims[0] != input_dims[2]) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"Last dimension of beta and input does not match");
}
if (nullptr != bias) {
const auto bias_dims = bias->Shape().GetDims();
if (bias_dims.size() != 1) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"bias is expected to have 1 dimension, got ", bias_dims.size());
}
if (bias_dims[0] != input_dims[2]) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"Last dimension of bias and input does not match");
}
}
int64_t batch_size = input_dims[0];
int64_t sequence_length = input_dims[1];
int64_t hidden_size = input_dims[2];
int64_t task_count = batch_size * sequence_length;
const T* input_data = input->Data<T>();
const T* skip_data = skip->Data<T>();
const T* gamma_data = gamma->Data<T>();
const T* beta_data = beta->Data<T>();
const T* bias_data = bias == nullptr ? nullptr : bias->Data<T>();
T* output_data = output->MutableData<T>();
concurrency::ThreadPool::TryBatchParallelFor(p_ctx->GetOperatorThreadPool(),
static_cast<int32_t>(task_count),
[&](int32_t task_idx) {
const T* p_input = input_data + task_idx * hidden_size;
const T* p_skip = skip_data + task_idx * hidden_size;
T* p_output = output_data + task_idx * hidden_size;
T mean = 0;
T mean_square = 0;
for (int64_t h = 0; h < hidden_size; h++) {
T value = p_input[h] + p_skip[h];
if (nullptr != bias_data) {
value += bias_data[h];
}
p_output[h] = value;
mean += value;
mean_square += value * value;
}
mean = mean / hidden_size;
mean_square = sqrt(mean_square / hidden_size - mean * mean + float(1e-12));
for (int64_t h = 0; h < hidden_size; h++) {
p_output[h] = (p_output[h] - mean) / mean_square * gamma_data[h] + beta_data[h];
}
});
return Status::OK();
} // namespace contrib
} // namespace contrib
} // namespace onnxruntime

View file

@ -0,0 +1,21 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/common/common.h"
#include "core/framework/op_kernel.h"
#include "core/framework/tensor.h"
namespace onnxruntime {
namespace contrib {
template <typename T>
class SkipLayerNorm final : public OpKernel {
public:
SkipLayerNorm(const OpKernelInfo& op_kernel_info);
Status Compute(OpKernelContext* p_op_kernel_context) const override;
};
} // namespace contrib
} // namespace onnxruntime

View file

@ -33,6 +33,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1,
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, CDist);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, double, CDist);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, Gelu);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, BiasGelu);
// This section includes all op kernel declarations for former experimental ops which have now been removed from onnx.
// To maintain backward compatibility these are added as contrib ops.
@ -68,6 +69,8 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSNchwcDomai
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSNchwcDomain, 1, float, GlobalAveragePool);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, float, LayerNormalization);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, double, LayerNormalization);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, SkipLayerNormalization);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, double, SkipLayerNormalization);
Status RegisterNchwcKernels(KernelRegistry& kernel_registry) {
static const BuildKernelCreateInfoFn function_table[] = {
@ -112,6 +115,7 @@ Status RegisterCpuContribKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, CDist)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, double, CDist)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, BiasGelu)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, Gelu)>,
// These ops were experimental ops in onnx domain which have been removed now. We add them here as
@ -138,7 +142,9 @@ Status RegisterCpuContribKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, 9, ThresholdedRelu)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, Scale)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, float, LayerNormalization)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, double, LayerNormalization)>};
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, double, LayerNormalization)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, SkipLayerNormalization)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, double, SkipLayerNormalization)>};
for (auto& function_table_entry : function_table) {
ORT_RETURN_IF_ERROR(kernel_registry.Register(function_table_entry()));

View file

@ -63,7 +63,7 @@ namespace cuda {
CONTRIB_BINARY_OP_TYPED(name, ver, float) \
CONTRIB_BINARY_OP_TYPED(name, ver, double)
CONTRIB_BINARY_OP_HFD(AddGeluFusion, 1)
CONTRIB_BINARY_OP_HFD(BiasGelu, 1)
} // namespace cuda
} // namespace contrib

View file

@ -16,9 +16,9 @@ namespace cuda {
// AddGelu fuse Add + Gelu
template <typename T>
class AddGeluFusion final : public BinaryElementwise<ShouldBroadcast> {
class BiasGelu final : public BinaryElementwise<ShouldBroadcast> {
public:
AddGeluFusion(const OpKernelInfo& info) : BinaryElementwise(info) {
BiasGelu(const OpKernelInfo& info) : BinaryElementwise(info) {
}
Status ComputeInternal(OpKernelContext* context) const override;

View file

@ -87,7 +87,7 @@ CONTRIB_BINARY_OPS()
// D: double
// O: bool
CONTRIB_SPECIALIZED_BINARY_ELEMENTWISE_IMPL_HFD(AddGeluFusion)
CONTRIB_SPECIALIZED_BINARY_ELEMENTWISE_IMPL_HFD(BiasGelu)
} // namespace cuda
} // namespace contrib

View file

@ -13,7 +13,7 @@ namespace cuda {
// 4. Add op kernel class definition in binary_elementwise_ops.h
// 5. Add op kernel registration and compute specialization in binary_elementwise_ops.cc
#define CONTRIB_BINARY_OPS() \
CONTRIB_BINARY_OP_NAME_EXPR(AddGeluFusion, _Gelu(a + b))
CONTRIB_BINARY_OP_NAME_EXPR(BiasGelu, _Gelu(a + b))
// NOTE that cu files are compiled with nvcc and should not refer to any onnxruntime headers
// so struct BinaryElementwisePreparation cannot be used here

View file

@ -14,9 +14,9 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, Gelu);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, Gelu);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, Gelu);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, AddGeluFusion);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, AddGeluFusion);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, AddGeluFusion);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, BiasGelu);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, BiasGelu);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, BiasGelu);
// These ops were experimental ops in onnx domain which have been removed now. We add them here as
// contrib ops to maintain backward compatibility
@ -58,9 +58,9 @@ void RegisterCudaContribKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, Gelu)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, Gelu)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, Gelu)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, AddGeluFusion)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, AddGeluFusion)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, AddGeluFusion)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, BiasGelu)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, BiasGelu)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, BiasGelu)>,
// These ops were experimental ops in onnx domain which have been removed now. We add them here as
// contrib ops to maintain backward compatibility

View file

@ -58,6 +58,19 @@ bool InsertRootScheduleAndClosure(
return true;
}
// Check precondition for vectorize schedule
bool ShouldTryVectorization(
const tvm::Tensor& tensor,
ScheduleContext& ctx) {
auto it = ctx.scheduled_tensors.find(tensor->op.get());
if (it != ctx.scheduled_tensors.end()) {
if (it->second > ScheduleType::ScheduleInline) {
return false;
}
}
return true;
}
// Check the schedule of tensor
// If it is not scheduled, try to vectorize it.
// Note TryVectorization has to use with compute_root.
@ -66,12 +79,8 @@ bool TryVectorization(
const tvm::Tensor& tensor,
int64_t natural_vector_size,
ScheduleContext& ctx) {
auto it = ctx.scheduled_tensors.find(tensor->op.get());
if (it != ctx.scheduled_tensors.end()) {
if (it->second > ScheduleType::ScheduleInline) {
return false;
}
}
if (!ShouldTryVectorization(tensor, ctx))
return false;
auto shape = tensor->shape;
auto rank = shape.size();

View file

@ -26,6 +26,11 @@ bool InsertRootScheduleAndClosure(
const tvm::Tensor& tensor,
ScheduleContext& ctx);
// Check precondition for vectorize schedule
bool ShouldTryVectorization(
const tvm::Tensor& tensor,
ScheduleContext& ctx);
// Check the schedule of tensor
// If it is not scheduled, try to vectorize it.
// Note TryVectorization has to use with compute_root.

View file

@ -57,6 +57,36 @@ void ThreadPool::ParallelFor(int32_t total, std::function<void(int32_t)> fn) {
barrier.Wait();
}
void ThreadPool::BatchParallelFor(int32_t total, std::function<void(int32_t)> fn, int32_t num_batches) {
if (total <= 0)
return;
if (total == 1) {
fn(0);
return;
}
if (num_batches <= 1) {
for (int i = 0; i < total; i++) {
fn(i);
}
return;
}
if (num_batches >= total) {
ParallelFor(total, fn);
return;
}
ParallelFor(num_batches, [&](int batch_index) {
int start = batch_index * total / num_batches;
int end = (batch_index + 1) * total / num_batches;
for (int i = start; i < end; i++) {
fn(i);
}
});
}
void ThreadPool::ParallelForRange(int64_t first, int64_t last, std::function<void(int64_t, int64_t)> fn) {
if (last <= first) return;
if (last - first == 1) {

View file

@ -55,6 +55,7 @@ struct SessionOptions {
TransformerLevel graph_optimization_level = TransformerLevel::Level1;
// controls the size of the thread pool used to parallelize the execution of tasks within individual nodes (ops)
// if OpenMP is enabled, this configuration will be ignored
int intra_op_num_threads = 0;
// controls the size of the thread pool used to parallelize the execution of nodes (ops)

View file

@ -1945,28 +1945,22 @@ inputs by their magnitude, rather than gates inputs by their sign as in ReLUs.)D
"Constrain input and output types to float tensors.")
.TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput);
ONNX_CONTRIB_OPERATOR_SCHEMA(AddGeluFusion)
static const char* BiasGelu_ver1_doc =
R"DOC(Bias Gelu.
It's an extension of Gelu. It takes the sum of input A and bias input B as the input of Gelu activation. )DOC";
ONNX_CONTRIB_OPERATOR_SCHEMA(BiasGelu)
.SetDomain(kMSDomain)
.SinceVersion(1)
.SetSupportLevel(OpSchema::SupportType::EXPERIMENTAL)
.SetDoc("AddGeluFusion fuses Add+Gelu. The fused Add op is the parent node of the fused Gelu.")
.Input(0, "A", "The input data as Tensor that is the first input of fused Add.", "T")
.Input(1, "B", "The input data as Tensor that is the second input of fused Add.", "T")
.SetDoc(BiasGelu_ver1_doc)
.Input(0, "A", "The normal input data.", "T")
.Input(1, "B", "The bias input data that is a 1D tensor.", "T")
.Output(0, "C", "The output.", "T")
.TypeConstraint(
"T",
{"tensor(float16)", "tensor(float)", "tensor(double)"},
"Constrain input and output types to float tensors.")
.TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
propagateElemTypeFromInputToOutput(ctx, 0, 0);
if (hasNInputShapes(ctx, 2)) {
bidirectionalBroadcastShapeInference(
ctx.getInputType(0)->tensor_type().shape(),
ctx.getInputType(1)->tensor_type().shape(),
*ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape());
}
});
.TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput);
RegisterBertSchemas();

View file

@ -2,7 +2,7 @@
// Licensed under the MIT License.
#include "core/optimizer/initializer.h"
#include "core/optimizer/add_gelu_fusion.h"
#include "core/optimizer/bias_gelu_fusion.h"
#include "core/graph/graph_utils.h"
#include <deque>
@ -10,7 +10,7 @@ using namespace ONNX_NAMESPACE;
using namespace ::onnxruntime::common;
namespace onnxruntime {
Status AddGeluFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const {
Status BiasGelu::ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const {
GraphViewer graph_viewer(graph);
const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder();
@ -29,6 +29,21 @@ Status AddGeluFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, c
continue;
}
std::vector<NodeArg*> gelu_input;
const TensorShapeProto* add_input1_shape = node.MutableInputDefs()[0]->Shape();
const TensorShapeProto* add_input2_shape = node.MutableInputDefs()[1]->Shape();
if (add_input1_shape != nullptr &&
add_input1_shape->dim_size() == 1) {
gelu_input.push_back(node.MutableInputDefs()[1]);
gelu_input.push_back(node.MutableInputDefs()[0]);
} else if (add_input2_shape != nullptr &&
add_input2_shape->dim_size() == 1) {
gelu_input.push_back(node.MutableInputDefs()[0]);
gelu_input.push_back(node.MutableInputDefs()[1]);
} else {
continue;
}
auto next_node_itr = node.OutputNodesBegin();
if (next_node_itr == node.OutputNodesEnd()) {
continue;
@ -47,10 +62,10 @@ Status AddGeluFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, c
Node& add_node = node;
Node& gelu_node = const_cast<Node&>(next_node);
Node& gelu_add_fusion_node = graph.AddNode(graph.GenerateNodeName("AddGeluFusion"),
"AddGeluFusion",
Node& gelu_add_fusion_node = graph.AddNode(graph.GenerateNodeName("BiasGelu"),
"BiasGelu",
"fused Add and Gelu",
{add_node.MutableInputDefs()},
gelu_input,
{},
{},
kMSDomain);

View file

@ -11,10 +11,10 @@ namespace onnxruntime {
@Class GeluFusion
Fuse Add + Gelu to GeluFusion
*/
class AddGeluFusion : public GraphTransformer {
class BiasGelu : public GraphTransformer {
public:
AddGeluFusion(const std::unordered_set<std::string>& compatible_execution_providers = {}) noexcept
: GraphTransformer("AddGeluFusion", compatible_execution_providers) {
BiasGelu(const std::unordered_set<std::string>& compatible_execution_providers = {}) noexcept
: GraphTransformer("BiasGelu", compatible_execution_providers) {
}
Status ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const override;

View file

@ -0,0 +1,270 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/optimizer/initializer.h"
#include "core/optimizer/embed_layer_norm_fusion.h"
#include "core/graph/graph_utils.h"
#include "float.h"
#define DEBUG_LOG(x) LOGS(logger, VERBOSE) << x
using namespace ONNX_NAMESPACE;
using namespace onnxruntime::common;
namespace onnxruntime {
// Add a Cast to convert Input from int64 to int32.
static NodeArg* CastToInt32(Graph& graph, NodeArg* input, ProviderType provider_type) {
const TensorShapeProto* input_shape = input->Shape();
TypeProto input_int32;
input_int32.mutable_tensor_type()->set_elem_type(TensorProto_DataType_INT32);
auto dim0 = input_int32.mutable_tensor_type()->mutable_shape()->add_dim();
*dim0 = input_shape->dim(0);
auto dim1 = input_int32.mutable_tensor_type()->mutable_shape()->add_dim();
*dim1 = input_shape->dim(1);
auto& cast32 = graph.GetOrCreateNodeArg(graph.GenerateNodeArgName(input->Name() + "_Int32"), &input_int32);
Node& node = graph.AddNode(graph.GenerateNodeName(input->Name() + "_Cast"),
"Cast",
"Cast Input from int64 to int32",
{input},
{&cast32},
nullptr,
kOnnxDomain);
// Add attribute: "to" = 6
ONNX_NAMESPACE::AttributeProto to;
to.set_name("to");
to.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT);
to.set_i(static_cast<int64_t>(ONNX_NAMESPACE::TensorProto_DataType_INT32));
node.AddAttribute("to", to);
node.SetExecutionProviderType(provider_type);
return &cast32;
}
static NodeArg* CheckInput(Graph& graph, NodeArg* input, ProviderType provider_type, const logging::Logger& logger) {
// Validate input shape (batch_size, sequence_length) and data type.
// Note that batch_size and sequence_length could be symbolic.
const TensorShapeProto* input_shape = input->Shape();
if (input_shape == nullptr || input_shape->dim_size() != 2 || input->Type() == nullptr) {
DEBUG_LOG("Mask shape is unknown or not 2D, or data type unknown");
return nullptr;
}
auto data_type = input->TypeAsProto()->tensor_type().elem_type();
if (data_type != ONNX_NAMESPACE::TensorProto_DataType_INT64 &&
data_type != ONNX_NAMESPACE::TensorProto_DataType_INT32) {
DEBUG_LOG("Input data type is not int32 or int64");
return nullptr;
}
if (data_type == ONNX_NAMESPACE::TensorProto_DataType_INT64) {
return CastToInt32(graph, input, provider_type);
}
return input;
}
/**
Embed Layer Normalization will fuse embeddings and mask processing into one node :
The embeddings before conversion:
(input_ids) --------> Gather ----------+ (segment_ids)
| | |
| v v
+--> Shape --> Expand -> Gather---->Add Gather
| ^ | |
| | v v
+---(optional graph) SkipLayerNormalization
*/
Status EmbedLayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const {
GraphViewer graph_viewer(graph);
const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder();
for (auto node_index : node_topology_list) {
auto* p_layer_norm = graph.GetNode(node_index);
if (p_layer_norm == nullptr)
continue; // we removed the node as part of an earlier fusion
Node& layer_norm_node = *p_layer_norm;
ORT_RETURN_IF_ERROR(Recurse(layer_norm_node, modified, graph_level, logger));
if (!graph_utils::IsSupportedOptypeVersionAndDomain(layer_norm_node, "LayerNormalization", {1}, kOnnxDomain) ||
!graph_utils::IsSupportedProvider(layer_norm_node, GetCompatibleExecutionProviders())) {
continue;
}
// Find Attention after SkipLayerNormalization
const Node* p_attention = graph_utils::FirstChildByType(layer_norm_node, "Attention");
// Stop EmbedLayerNormalization fusion if Attention is not found.
if (p_attention == nullptr) {
return Status::OK();
}
Node& attention_node = *graph.GetNode(p_attention->Index());
if (!graph_utils::IsSupportedOptypeVersionAndDomain(attention_node, "Attention", {1}, kMSDomain) ||
!graph_utils::IsSupportedProvider(attention_node, GetCompatibleExecutionProviders())) {
continue;
}
// Find ReduceSum --> Attention
std::vector<const Node::EdgeEnd*> edges;
if (!graph_utils::FindPath(attention_node, true, {{0, 3, "ReduceSum", {1, 11}, kOnnxDomain}}, edges, logger)) {
continue;
}
Node& reduce_sum_node = *graph.GetNode(edges[0]->GetNode().Index());
// Find Add --> LayerNormalization
if (!graph_utils::FindPath(layer_norm_node, true, {{0, 0, "Add", {7}, kOnnxDomain}}, edges, logger)) {
continue;
}
Node& layer_norm_add_node = *graph.GetNode(edges[0]->GetNode().Index());
// Traceback the SkipLayerNormalization node to find Gather --> SkipLayerNormalization
std::vector<graph_utils::EdgeEndToMatch> segment_embedding_path{
{0, 1, "Gather", {1, 11}, kOnnxDomain}};
if (!graph_utils::FindPath(layer_norm_add_node, true, segment_embedding_path, edges, logger)) {
continue;
}
Node& segment_gather_node = *graph.GetNode(edges[0]->GetNode().Index());
if (segment_gather_node.GetOutputEdgesCount() != 1) {
continue;
}
// The first input of segment_gather_node must be 2d.
auto sg_shape = segment_gather_node.MutableInputDefs()[0]->Shape();
if (sg_shape != nullptr && sg_shape->dim_size() != 2) {
continue;
}
// Traceback the SkipLayerNormalization node to find Gather --> Add --> SkipLayerNormalization
std::vector<graph_utils::EdgeEndToMatch> word_embedding_path{
{0, 0, "Add", {7}, kOnnxDomain},
{0, 0, "Gather", {1, 11}, kOnnxDomain}};
if (!graph_utils::FindPath(layer_norm_add_node, true, word_embedding_path, edges, logger)) {
continue;
}
Node& add_node = *graph.GetNode(edges[0]->GetNode().Index());
Node& word_gather_node = *graph.GetNode(edges[1]->GetNode().Index());
if (add_node.GetOutputEdgesCount() != 1 || word_gather_node.GetOutputEdgesCount() != 1) {
continue;
}
// The first input of word_gather_node must be 2d.
auto wg_shape = word_gather_node.MutableInputDefs()[0]->Shape();
if (wg_shape != nullptr && wg_shape->dim_size() != 2) {
continue;
}
// Traceback the Add node to find (Shape --> Expand -->) Gather --> Add.
// Constant folding removes Shape and Expand nodes when input does not have symbolic shape. In that
// case just look for Gather --> Add.
std::vector<graph_utils::EdgeEndToMatch> position_embedding_path{
{0, 1, "Gather", {1, 11}, kOnnxDomain}};
if (!graph_utils::FindPath(add_node, true, position_embedding_path, edges, logger)) {
continue;
}
Node& position_gather_node = *graph.GetNode(edges[0]->GetNode().Index());
if (position_gather_node.GetOutputEdgesCount() != 1) {
continue;
}
// The first input of position_gather_node must be 2d.
auto pg_shape = position_gather_node.MutableInputDefs()[0]->Shape();
if (pg_shape != nullptr && pg_shape->dim_size() != 2) {
continue;
}
// Match Shape --> Expand path if needed.
std::vector<graph_utils::EdgeEndToMatch> position_embedding_path_symbolic{
{0, 1, "Expand", {8}, kOnnxDomain},
{0, 1, "Shape", {1}, kOnnxDomain}};
Node* p_expand_node = nullptr;
Node* p_shape_node = nullptr;
if (graph_utils::FindPath(position_gather_node, true, position_embedding_path_symbolic, edges, logger)) {
if (edges[0]->GetNode().GetOutputEdgesCount() == 1 && edges[1]->GetNode().GetOutputEdgesCount() == 1) {
p_expand_node = graph.GetNode(edges[0]->GetNode().Index());
p_shape_node = graph.GetNode(edges[1]->GetNode().Index());
}
}
// Get input "input_ids" from node.
NodeArg* input_ids = CheckInput(graph, word_gather_node.MutableInputDefs()[1], layer_norm_node.GetExecutionProviderType(), logger);
if (input_ids == nullptr) {
DEBUG_LOG("Input id is not valid. ");
continue;
}
// Get input "segment_ids" from node.
NodeArg* segment_ids = CheckInput(graph, segment_gather_node.MutableInputDefs()[1], layer_norm_node.GetExecutionProviderType(), logger);
if (segment_ids == nullptr) {
DEBUG_LOG("Segment id is not valid. ");
continue;
}
// Get input "mask" from "ReduceSum" node.
NodeArg* mask = CheckInput(graph, reduce_sum_node.MutableInputDefs()[0], layer_norm_node.GetExecutionProviderType(), logger);
if (mask == nullptr) {
DEBUG_LOG("Mask is not valid. ");
continue;
}
const std::vector<NodeArg*> embed_layer_norm_input_defs{
input_ids,
segment_ids,
mask,
word_gather_node.MutableInputDefs()[0],
position_gather_node.MutableInputDefs()[0],
segment_gather_node.MutableInputDefs()[0],
layer_norm_node.MutableInputDefs()[1],
layer_norm_node.MutableInputDefs()[2]};
Node& embed_layer_norm_node = graph.AddNode(graph.GenerateNodeName("EmbedLayerNormalization"),
"EmbedLayerNormalization",
"fused EmbedLayerNorm subgraphs ",
embed_layer_norm_input_defs,
{layer_norm_node.MutableOutputDefs()[0], reduce_sum_node.MutableOutputDefs()[0]},
{}, kMSDomain);
// Assign provider to this new node. Provider should be same as the provider for old node.
embed_layer_norm_node.SetExecutionProviderType(layer_norm_node.GetExecutionProviderType());
// move input edges to gather (first in list) across to the embed_layer_norm_node.
// move output definitions and output edges to embed_layer_norm_node.
// remove all the other nodes.
std::vector<NodeIndex> nodes_to_remove;
if (p_shape_node != nullptr && p_expand_node != nullptr) {
// Match Shape --> Gather --> Unsqueeze --> ConstantOfShape --> NonZero --> Transpose --> Squeeze --> Cast --> Unsqueeze --> Expand
if (p_expand_node != nullptr) {
Node& expand_node = *graph.GetNode(p_expand_node->Index());
std::vector<graph_utils::EdgeEndToMatch> expand_parent_path{
{0, 0, "Unsqueeze", {1, 11}, kOnnxDomain},
{0, 0, "Cast", {9}, kOnnxDomain},
{0, 0, "Squeeze", {1}, kOnnxDomain},
{0, 0, "Transpose", {1}, kOnnxDomain},
{0, 0, "NonZero", {9}, kOnnxDomain},
{0, 0, "ConstantOfShape", {9}, kOnnxDomain},
{0, 0, "Unsqueeze", {1, 11}, kOnnxDomain},
{0, 0, "Gather", {1, 11}, kOnnxDomain},
{0, 0, "Shape", {1}, kOnnxDomain},
};
if (graph_utils::FindPath(expand_node, true, expand_parent_path, edges, logger)) {
for (size_t i = 0; i < edges.size(); i++) {
if (edges[i]->GetNode().GetOutputEdgesCount() != 1) {
nodes_to_remove.clear();
break;
}
nodes_to_remove.push_back(edges[i]->GetNode().Index());
}
}
}
nodes_to_remove.push_back(p_shape_node->Index());
nodes_to_remove.push_back(p_expand_node->Index());
}
nodes_to_remove.push_back(word_gather_node.Index());
nodes_to_remove.push_back(position_gather_node.Index());
nodes_to_remove.push_back(segment_gather_node.Index());
nodes_to_remove.push_back(add_node.Index());
nodes_to_remove.push_back(reduce_sum_node.Index());
nodes_to_remove.push_back(layer_norm_add_node.Index());
nodes_to_remove.push_back(layer_norm_node.Index());
for (const auto& index : nodes_to_remove) {
Node* node = graph.GetNode(index);
graph_utils::RemoveNodeOutputEdges(graph, *node);
graph.RemoveNode(node->Index());
}
modified = true;
}
return Status::OK();
}
} // namespace onnxruntime

View file

@ -0,0 +1,24 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/optimizer/graph_transformer.h"
namespace onnxruntime {
/**
@Class EmbedLayerNormFusion
Rewrite graph fusing embeddings and mask processing into one node.
*/
class EmbedLayerNormFusion : public GraphTransformer {
public:
EmbedLayerNormFusion(const std::unordered_set<std::string>& compatible_execution_providers = {}) noexcept
: GraphTransformer("EmbedLayerNormFusion", compatible_execution_providers) {}
Status ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const override;
};
} // namespace onnxruntime

View file

@ -0,0 +1,119 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/optimizer/initializer.h"
#include "core/optimizer/gelu_approximation.h"
#include "core/framework/tensorprotoutils.h"
#include "core/optimizer/utils.h"
#include "core/graph/graph_utils.h"
#include "float.h"
using namespace ONNX_NAMESPACE;
using namespace onnxruntime::common;
namespace onnxruntime {
// FastGelu supports limited data types.
static std::vector<std::string> supported_data_types{"tensor(float16)", "tensor(float)"};
static bool IsSupportedDataType(const Node& node) {
for (const auto& input_arg : node.InputDefs()) {
if (std::find(supported_data_types.begin(), supported_data_types.end(),
*(input_arg->Type())) == supported_data_types.end()) {
return false;
}
}
return true;
}
static bool CheckInputShape(const Node& node, const NodeArg& input, const NodeArg& bias) {
const TensorShapeProto* bias_shape = bias.Shape();
if (nullptr == bias_shape ||
bias_shape->dim_size() != 1 ||
!utils::HasDimValue(bias_shape->dim(0))) {
return false;
}
auto bias_length = bias_shape->dim(0).dim_value();
const TensorShapeProto* input_shape = input.Shape();
if (nullptr != input_shape) {
if (input_shape->dim_size() >= 1) {
int last_dim = input_shape->dim_size() - 1;
if (utils::HasDimValue(input_shape->dim(last_dim)) &&
input_shape->dim(last_dim).dim_value() == bias_length) {
return true;
}
}
return false;
}
// Input does not have shape. We will check its parent node.
// When the parent is MatMul and its 2nd input has shape like {*, bias_length},
// it means that the shape of MatMul output is good for FastGelu.
const Node* parent_node = graph_utils::GetInputNode(node, 0);
if (nullptr != parent_node &&
graph_utils::IsSupportedOptypeVersionAndDomain(*parent_node, "MatMul", {1, 9}, kOnnxDomain)) {
const NodeArg& input_b = *(parent_node->InputDefs()[1]);
if (optimizer_utils::ValidateShape(input_b, {-1, bias_length})) {
return true;
}
}
return false;
}
static bool CheckGeluInputShape(const NodeArg& input) {
const TensorShapeProto* input_shape = input.Shape();
return nullptr != input_shape && input_shape->dim_size() >= 1;
}
static bool IsCandidateNode(const Node& node, const std::unordered_set<std::string>& compatible_providers) {
if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "BiasGelu", {1}, kMSDomain)) {
return graph_utils::IsSupportedProvider(node, compatible_providers) &&
IsSupportedDataType(node) &&
CheckInputShape(node, *(node.InputDefs()[0]), *(node.InputDefs()[1]));
} else if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "Gelu", {1}, kMSDomain)) {
return graph_utils::IsSupportedProvider(node, compatible_providers) &&
IsSupportedDataType(node) &&
CheckGeluInputShape(*(node.InputDefs()[0]));
}
return false;
}
Status GeluApproximation::ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const {
GraphViewer graph_viewer(graph);
const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder();
int count = 0;
for (auto node_index : node_topology_list) {
auto* p_node = graph.GetNode(node_index);
if (p_node == nullptr)
continue; // we removed the node as part of an earlier fusion
Node& node = *p_node;
ORT_RETURN_IF_ERROR(Recurse(node, modified, graph_level, logger));
if (IsCandidateNode(node, GetCompatibleExecutionProviders())) {
Node& fastgelu = graph.AddNode(
graph.GenerateNodeName("FastGelu"),
"FastGelu",
"Gelu approximation",
node.MutableInputDefs(),
node.MutableOutputDefs(), nullptr, kMSDomain);
fastgelu.SetExecutionProviderType(node.GetExecutionProviderType());
graph_utils::RemoveNodeOutputEdges(graph, node);
graph.RemoveNode(node.Index());
count++;
}
}
if (count > 0) {
modified = true;
LOGS(logger, INFO) << "Total Gelu Approximation (FastGelu) node count: " << count;
}
return Status::OK();
}
} // namespace onnxruntime

View file

@ -0,0 +1,24 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/optimizer/graph_transformer.h"
namespace onnxruntime {
/**
@Class GeluApproximation
Rewrite graph to replace Gelu or AddGeluFusion by FastGelu node. FastGelu uses approximation for Gelu,
and it is faster.
*/
class GeluApproximation : public GraphTransformer {
public:
GeluApproximation(const std::unordered_set<std::string>& compatible_execution_providers={}) noexcept
: GraphTransformer("GeluApproximation", compatible_execution_providers) {}
Status ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const override;
};
} // namespace onnxruntime

View file

@ -18,10 +18,12 @@
#include "core/optimizer/shape_to_initializer.h"
#include "core/optimizer/nchwc_transformer.h"
#include "core/optimizer/free_dim_override_transformer.h"
#include "core/optimizer/add_gelu_fusion.h"
#include "core/optimizer/bias_gelu_fusion.h"
#include "core/optimizer/gelu_fusion.h"
#include "core/optimizer/gelu_approximation.h"
#include "core/optimizer/layer_norm_fusion.h"
#include "core/optimizer/skip_layer_norm_fusion.h"
#include "core/optimizer/embed_layer_norm_fusion.h"
#include "core/optimizer/reshape_fusion.h"
#include "core/optimizer/attention_fusion.h"
#include "core/mlas/inc/mlas.h"
@ -127,11 +129,12 @@ std::vector<std::unique_ptr<GraphTransformer>> GenerateTransformers(TransformerL
transformers.emplace_back(onnxruntime::make_unique<GeluFusion>(cpu_cuda_execution_providers));
transformers.emplace_back(onnxruntime::make_unique<LayerNormFusion>(cpu_cuda_execution_providers));
transformers.emplace_back(onnxruntime::make_unique<AttentionFusion>(cpu_cuda_execution_providers));
transformers.emplace_back(onnxruntime::make_unique<EmbedLayerNormFusion>(cpu_cuda_execution_providers));
transformers.emplace_back(onnxruntime::make_unique<BiasGelu>(cpu_cuda_execution_providers));
transformers.emplace_back(onnxruntime::make_unique<SkipLayerNormFusion>(cpu_cuda_execution_providers));
std::unordered_set<std::string> cuda_execution_providers = {onnxruntime::kCudaExecutionProvider};
transformers.emplace_back(onnxruntime::make_unique<AddGeluFusion>(cuda_execution_providers));
transformers.emplace_back(onnxruntime::make_unique<SkipLayerNormFusion>(cuda_execution_providers));
transformers.emplace_back(onnxruntime::make_unique<GeluApproximation>(cuda_execution_providers));
#endif
} break;
@ -142,7 +145,6 @@ std::vector<std::unique_ptr<GraphTransformer>> GenerateTransformers(TransformerL
transformers.emplace_back(onnxruntime::make_unique<NchwcTransformer>());
}
#endif
} break;
default:

View file

@ -8,12 +8,9 @@
namespace onnxruntime {
/**
@Class LayerNormFusion
@Class SkipLayerNormFusion
Rewrite graph fusing Layer Normalization subgraph to a single LayerNormalization node.
The formula corresponding to LayerNorm activation subgraph:
(x - mean(x, axis)) / sqrt(var(x, axis)) * scale + bias, where x is the input.
Rewrite graph fusing Add + Layer Normalization subgraph to a single SkipLayerNormalization node.
*/
class SkipLayerNormFusion : public GraphTransformer {

View file

@ -42,17 +42,6 @@ ADD_TYPED_ROIALIGN_OP(float);
ADD_TYPED_ROIALIGN_OP(double);
namespace {
template <typename T>
void TryParallelFor(concurrency::ThreadPool* tp, int32_t total, T&& fn) {
if (tp != nullptr)
tp->ParallelFor(total, fn);
else {
for (int32_t i = 0; i != total; ++i) {
fn(i);
}
}
}
template <typename T>
struct PreCalc {
int64_t pos1;
@ -183,9 +172,7 @@ void RoiAlignForward(const TensorShape& output_shape,
int64_t pooled_height = output_shape[2];
int64_t pooled_width = output_shape[3];
// TODO: This should do blocks of work based on the number of threads in the threadpool with each block
// being n_rois / num_threads
std::function<void(int32_t)> work_object = [&](int32_t n) {
ThreadPool::TryBatchParallelFor(ttp, static_cast<int32_t>(n_rois), [&](int32_t n) {
int64_t index_n = n * channels * pooled_width * pooled_height;
const T* offset_bottom_rois = bottom_rois + n * num_roi_cols;
@ -281,9 +268,7 @@ void RoiAlignForward(const TensorShape& output_shape,
} // for pw
} // for ph
} // for c
}; // for n
TryParallelFor(ttp, static_cast<int32_t>(n_rois), work_object);
}); // for n
}
} // namespace

View file

@ -300,9 +300,9 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 8, float, Greater);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 8, double, Greater);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 8, MLFloat16, Greater);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, bool, Equal);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, int32_t, Equal);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, int64_t, Equal);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 10, bool, Equal);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 10, int32_t, Equal);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 10, int64_t, Equal);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 8, Expand);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, int32_t, Greater);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, int64_t, Greater);
@ -363,7 +363,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, float, Ceil);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, double, Ceil);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, MLFloat16, Ceil);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, float, Clip);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, 10, float, Clip);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, float, Reciprocal);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, double, Reciprocal);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, MLFloat16, Reciprocal);
@ -389,12 +389,12 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, float, LRN);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, double, LRN);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MLFloat16, LRN);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, float, Conv);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, double, Conv);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MLFloat16, Conv);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, float, ConvTranspose);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, double, ConvTranspose);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MLFloat16, ConvTranspose);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, float, Conv);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, double, Conv);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, MLFloat16, Conv);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, float, ConvTranspose);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, double, ConvTranspose);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, MLFloat16, ConvTranspose);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 9, float, AveragePool);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 9, double, AveragePool);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 9, MLFloat16, AveragePool);
@ -549,13 +549,13 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDom
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, 10, Scan);
// opset 10
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, float, AveragePool);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, double, AveragePool);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, MLFloat16, AveragePool);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, float, AveragePool);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, double, AveragePool);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, MLFloat16, AveragePool);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, Dropout);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, float, MaxPool);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, double, MaxPool);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, MLFloat16, MaxPool);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, float, MaxPool);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, double, MaxPool);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, MLFloat16, MaxPool);
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, NonMaxSuppression);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, float, Resize);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, double, Resize);
@ -640,6 +640,22 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, S
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, Squeeze);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, TopK);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, Unsqueeze);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, Conv);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, Conv);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, Conv);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, ConvTranspose);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, ConvTranspose);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, ConvTranspose);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, AveragePool);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, AveragePool);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, AveragePool);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, MaxPool);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, MaxPool);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, MaxPool);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, Clip);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, bool, Equal);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, int32_t, Equal);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, int64_t, Equal);
static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
static const BuildKernelCreateInfoFn function_table[] = {
@ -664,7 +680,7 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, float, MatMul)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, double, MatMul)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, MLFloat16, MatMul)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, float, Clip)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, 10, float, Clip)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, float, Tile)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, double, Tile)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, MLFloat16, Tile)>,
@ -736,9 +752,9 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 8, float, Greater)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 8, double, Greater)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 8, MLFloat16, Greater)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, bool, Equal)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, int32_t, Equal)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, int64_t, Equal)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 10, bool, Equal)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 10, int32_t, Equal)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 10, int64_t, Equal)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 8, Expand)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, int32_t, Greater)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, int64_t, Greater)>,
@ -824,12 +840,12 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, float, LRN)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, double, LRN)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MLFloat16, LRN)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, float, Conv)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, double, Conv)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MLFloat16, Conv)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, float, ConvTranspose)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, double, ConvTranspose)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MLFloat16, ConvTranspose)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, float, Conv)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, double, Conv)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, MLFloat16, Conv)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, float, ConvTranspose)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, double, ConvTranspose)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, MLFloat16, ConvTranspose)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 9, float, AveragePool)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 9, double, AveragePool)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 9, MLFloat16, AveragePool)>,
@ -983,13 +999,13 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, 10, Scan)>,
// opset 10
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, float, AveragePool)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, double, AveragePool)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, MLFloat16, AveragePool)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, float, AveragePool)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, double, AveragePool)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, MLFloat16, AveragePool)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, Dropout)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, float, MaxPool)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, double, MaxPool)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, MLFloat16, MaxPool)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, float, MaxPool)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, double, MaxPool)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, MLFloat16, MaxPool)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, NonMaxSuppression)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, float, Resize)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, double, Resize)>,
@ -1075,6 +1091,22 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, Squeeze)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, TopK)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, Unsqueeze)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, Conv)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, Conv)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, Conv)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, ConvTranspose)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, ConvTranspose)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, ConvTranspose)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, AveragePool)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, AveragePool)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, AveragePool)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, MaxPool)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, MaxPool)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, MaxPool)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, Clip)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, bool, Equal)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, int32_t, Equal)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, int64_t, Equal)>,
};
for (auto& function_table_entry : function_table) {

View file

@ -176,6 +176,11 @@ Status BinaryElementwise<ShouldBroadcast>::Prepare(OpKernelContext* context, Bin
BINARY_ELEMENTWISE_REGISTER_KERNEL_TYPED(name, ver, int32_t) \
BINARY_ELEMENTWISE_REGISTER_KERNEL_TYPED(name, ver, int64_t)
#define BINARY_OP_REGISTER_VERSIONED_OIL(name, startver, endver) \
BINARY_ELEMENTWISE_REGISTER_KERNEL_VERSIONED_TYPED(name, startver, endver, bool) \
BINARY_ELEMENTWISE_REGISTER_KERNEL_VERSIONED_TYPED(name, startver, endver, int32_t) \
BINARY_ELEMENTWISE_REGISTER_KERNEL_VERSIONED_TYPED(name, startver, endver, int64_t)
#define BINARY_LOGICALOP_REGISTER_OIL(name, ver) \
BINARY_ELEMENTWISE_LOGICALOP_REGISTER_KERNEL_TYPED(name, ver, bool) \
BINARY_ELEMENTWISE_LOGICALOP_REGISTER_KERNEL_TYPED(name, ver, int32_t) \
@ -379,7 +384,8 @@ Status Less<T>::ComputeInternal(OpKernelContext* context) const {
BINARY_OP_REGISTER_UZILHFD(Sum, 8)
BINARY_OP_REGISTER_VERSIONED_UZILHFD(Sum, 6, 7)
BINARY_LOGICALOP_REGISTER_UZILHFD(Greater, 9)
BINARY_OP_REGISTER_OIL(Equal, 7)
BINARY_OP_REGISTER_VERSIONED_OIL(Equal, 7, 10)
BINARY_OP_REGISTER_OIL(Equal, 11)
BINARY_OP_REGISTER_VERSIONED_HFD(Greater, 7, 8)
BINARY_OP_REGISTER_HFD(Max, 8)
BINARY_OP_REGISTER_VERSIONED_HFD(Max, 6, 7)

View file

@ -8,38 +8,61 @@
namespace onnxruntime {
namespace cuda {
#define REGISTER_KERNEL_TYPED(T) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
Clip, \
kOnnxDomain, \
6, \
T, \
kCudaExecutionProvider, \
KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
#define REGISTER_KERNEL_TYPED(T) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
Clip, \
kOnnxDomain, \
6, \
10, \
T, \
kCudaExecutionProvider, \
KernelDefBuilder() \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
Clip<T>); \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
Clip, \
kOnnxDomain, \
11, \
T, \
kCudaExecutionProvider, \
KernelDefBuilder() \
.InputMemoryType<OrtMemTypeCPUInput>(1) \
.InputMemoryType<OrtMemTypeCPUInput>(2) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
Clip<T>);
template <typename T>
Status Clip<T>::ComputeInternal(OpKernelContext* ctx) const {
T min_val = min_;
T max_val = max_;
if (is_min_max_input_) {
const auto* min_input = ctx->Input<Tensor>(1);
const auto* max_input = ctx->Input<Tensor>(2);
if (min_input) {
ORT_ENFORCE(min_input->Shape().NumDimensions() == 0, "min should be a scalar.");
min_val = *(min_input->template Data<T>());
}
if (max_input) {
ORT_ENFORCE(max_input->Shape().NumDimensions() == 0, "max should be a scalar.");
max_val = *(max_input->template Data<T>());
}
ORT_ENFORCE(min_val <= max_val);
}
const Tensor& X = *ctx->Input<Tensor>(0);
const TensorShape input_shape{X.Shape()};
const TensorShape& input_shape{X.Shape()};
Tensor* Y = ctx->Output(0, input_shape);
size_t count = input_shape.Size();
if (count > 0) {
auto* y_data = Y->template MutableData<T>();
const auto* x_data = X.template Data<T>();
ClipImpl<T>(x_data, y_data, min_, max_, count);
ClipImpl<T>(x_data, y_data, min_val, max_val, count);
}
return Status::OK();
}
#define SPECIALIZED_COMPUTE(T) \
REGISTER_KERNEL_TYPED(T) \
template Status Clip<T>::ComputeInternal(OpKernelContext* ctx) const;
SPECIALIZED_COMPUTE(float)
REGISTER_KERNEL_TYPED(float)
} // namespace cuda
} // namespace onnxruntime

View file

@ -10,21 +10,29 @@ namespace cuda {
template <typename T>
class Clip final : public CudaKernel {
public:
Clip(const OpKernelInfo& info) : CudaKernel{info} {
auto min_val = -std::numeric_limits<T>::infinity();
auto max_val = std::numeric_limits<T>::infinity();
Clip(const OpKernelInfo& info) : CudaKernel{info}, is_min_max_input_(false) {
int start_version;
int end_version;
info.GetKernelDef().SinceVersion(&start_version, &end_version);
info.GetAttrOrDefault("min", &min_, min_val);
info.GetAttrOrDefault("max", &max_, max_val);
// Make sure the range of interval is sensible
ORT_ENFORCE(min_val <= max_val);
if (start_version < 11) {
auto min_val = -std::numeric_limits<T>::infinity();
auto max_val = std::numeric_limits<T>::infinity();
info.GetAttrOrDefault("min", &min_, min_val);
info.GetAttrOrDefault("max", &max_, max_val);
ORT_ENFORCE(min_ <= max_);
} else {
min_ = -std::numeric_limits<T>::infinity();
max_ = std::numeric_limits<T>::infinity();
is_min_max_input_ = true;
}
}
Status ComputeInternal(OpKernelContext* context) const override;
private:
T min_, max_;
bool is_min_max_input_;
};
} // namespace cuda

View file

@ -1,7 +1,6 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/cuda/math/clip_impl.h"
#include "core/providers/cuda/cu_inc/common.cuh"

View file

@ -9,11 +9,21 @@
namespace onnxruntime {
namespace cuda {
// Op Set 11 for Conv only update document to clearify default dilations and strides value.
// which are already convered by op set 11 cpu versoin, so simply add declaration.
#define REGISTER_KERNEL_TYPED(T) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
Conv, \
kOnnxDomain, \
1, 10, \
T, \
kCudaExecutionProvider, \
KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
Conv<T>); \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
Conv, \
kOnnxDomain, \
1, \
11, \
T, \
kCudaExecutionProvider, \
KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \

View file

@ -6,11 +6,21 @@
namespace onnxruntime {
namespace cuda {
// Op Set 11 for ConvTranspose only update document to clearify default dilations and strides value.
// which are already covered by op set 11 cpu version, so simply add declaration.
#define REGISTER_KERNEL_TYPED(T) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
ConvTranspose, \
kOnnxDomain, \
1, 10, \
T, \
kCudaExecutionProvider, \
KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
ConvTranspose<T>); \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
ConvTranspose, \
kOnnxDomain, \
1, \
11, \
T, \
kCudaExecutionProvider, \
KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \

View file

@ -30,12 +30,17 @@ namespace cuda {
KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<data_type>()).TypeConstraint("I", DataTypeImpl::GetTensorType<int64_t>()), \
Pool<data_type, pool_type>);
POOLING_KERNEL_VERSIONED(AveragePool, float, AveragePool, 7, 9)
POOLING_KERNEL_VERSIONED(AveragePool, double, AveragePool, 7, 9)
POOLING_KERNEL_VERSIONED(AveragePool, MLFloat16, AveragePool, 7, 9)
POOLING_KERNEL(AveragePool, float, AveragePool, 10)
POOLING_KERNEL(AveragePool, double, AveragePool, 10)
POOLING_KERNEL(AveragePool, MLFloat16, AveragePool, 10)
POOLING_KERNEL_VERSIONED(AveragePool, float, AveragePool, 10, 10)
POOLING_KERNEL_VERSIONED(AveragePool, double, AveragePool, 10, 10)
POOLING_KERNEL_VERSIONED(AveragePool, MLFloat16, AveragePool, 10, 10)
//AveragePool and MaxPool op set 11 only update spec document on default value for dilations and strides.
POOLING_KERNEL(AveragePool, float, AveragePool, 11)
POOLING_KERNEL(AveragePool, double, AveragePool, 11)
POOLING_KERNEL(AveragePool, MLFloat16, AveragePool, 11)
POOLING_KERNEL(GlobalAveragePool, float, AveragePool, 1)
POOLING_KERNEL(GlobalAveragePool, double, AveragePool, 1)
POOLING_KERNEL(GlobalAveragePool, MLFloat16, AveragePool, 1)
@ -45,9 +50,12 @@ POOLING_KERNEL_VERSIONED(MaxPool, MLFloat16, MaxPool<1>, 1, 7)
POOLING_KERNEL_VERSIONED(MaxPool, float, MaxPool<8>, 8, 9)
POOLING_KERNEL_VERSIONED(MaxPool, double, MaxPool<8>, 8, 9)
POOLING_KERNEL_VERSIONED(MaxPool, MLFloat16, MaxPool<8>, 8, 9)
POOLING_KERNEL(MaxPool, float, MaxPool<8>, 10)
POOLING_KERNEL(MaxPool, double, MaxPool<8>, 10)
POOLING_KERNEL(MaxPool, MLFloat16, MaxPool<8>, 10)
POOLING_KERNEL_VERSIONED(MaxPool, float, MaxPool<8>, 10, 10)
POOLING_KERNEL_VERSIONED(MaxPool, double, MaxPool<8>, 10, 10)
POOLING_KERNEL_VERSIONED(MaxPool, MLFloat16, MaxPool<8>, 10, 10)
POOLING_KERNEL(MaxPool, float, MaxPool<8>, 11)
POOLING_KERNEL(MaxPool, double, MaxPool<8>, 11)
POOLING_KERNEL(MaxPool, MLFloat16, MaxPool<8>, 11)
POOLING_KERNEL(GlobalMaxPool, float, MaxPool<1>, 1)
POOLING_KERNEL(GlobalMaxPool, double, MaxPool<1>, 1)
POOLING_KERNEL(GlobalMaxPool, MLFloat16, MaxPool<1>, 1)

View file

@ -38,7 +38,8 @@ static const std::unordered_set<std::string> valid_keys = {
kNupharCacheSoName,
kNupharCacheModelChecksum,
kNupharCacheForceNoJIT,
kNupharCodeGenTarget};
kNupharCodeGenTarget,
kNupharParallelMinWorkloads};
void SetDefaultOptions(std::map<std::string, std::string>& options) {
// create two temporary strings to get rid of the odr-use issue introduced
@ -56,6 +57,20 @@ void SetDefaultOptions(std::map<std::string, std::string>& options) {
std::string cache_so_name_opt(kNupharCacheSoName);
std::string cache_so_name_default(kNupharCacheSoName_Default);
options.insert(std::make_pair(cache_so_name_opt, cache_so_name_default));
std::string parallel_min_workloads_opt(kNupharParallelMinWorkloads);
#if defined(USE_OPENMP) || defined(USE_MKLML)
// a rough estimate of workloads based on static dimensions for each thread, when using parallel schedule
// user may change it to 0 to turn it off,
// or use OMP_NUM_THREADS to control TVM thread pool similar to control MKL
unsigned int parallel_min_workloads_default = 64;
#else
// turn off parallel schedule by default to avoid TVM thread pool confliction with others
// this is to ensure performance when user runs multiple inference threads, with each runs as single thread
// if needed, user can override it with settings, and use TVM_NUM_THREADS to control the thread pool
unsigned int parallel_min_workloads_default = 0;
#endif
options.insert(std::make_pair(parallel_min_workloads_opt, std::to_string(parallel_min_workloads_default)));
}
void CreateNupharCodeGenSettings(const NupharExecutionProviderInfo& info) {

View file

@ -45,6 +45,9 @@ constexpr static const char* kNupharActivations_DeepCpu = "deep_cpu_activation";
// Option to control nuphar code generation target (avx / avx2 / avx512)
constexpr static const char* kNupharCodeGenTarget = "nuphar_codegen_target";
// Option to control nuphar code to run with parallel schedule
constexpr static const char* kNupharParallelMinWorkloads = "nuphar_parallel_min_workloads";
// cache version number (MAJOR.MINOR.PATCH) following https://semver.org/
// 1. MAJOR version when you make incompatible changes that old cache files no longer work,
// 2. MINOR version when you add functionality in a backwards - compatible manner, and

View file

@ -28,7 +28,7 @@ struct NupharCodeGenHandle : codegen::CodeGenHandle {
std::shared_ptr<tvm_codegen::TVMScheduleBuilder> schedule_builder; // keep
// maybe add a layout
tvm_codegen::WeightLayoutRegistry* layout_registry;
bool enable_per_node_parallelized; // TODO: change to config
int64_t parallel_min_workloads;
bool allow_unaligned_buffers; // move to another place

View file

@ -9,6 +9,7 @@
#include "core/providers/nuphar/common/analysis/subgraph_codegen_stats.h"
#include "core/providers/nuphar/compiler/x86/x86_target_info.h"
#include "core/providers/nuphar/compiler/x86/scheduler/nuphar_scheduler.h"
// TODO change name space
namespace onnxruntime {
@ -37,9 +38,8 @@ static void Traverse(const tvm::Tensor& tensor,
if (is_real_output) {
CodeGenTargetX86* target = dynamic_cast<CodeGenTargetX86*>(ctx_codegen.GetCodeGenHandle()->codegen_target);
ORT_ENFORCE(target != nullptr);
int64_t natural_vector_size = target->NaturalVectorWidth(tensor->dtype.bits());
TryVectorization(tensor, natural_vector_size, ctx_schedule); // to x86
TryVectorizationX86(tensor, ctx_codegen, ctx_schedule);
InsertRootScheduleAndClosure(tensor, ctx_schedule);
}

View file

@ -39,5 +39,13 @@ bool InputRootScheduleWithVectorizationX86(
tvm_codegen::CodeGenContext& ctx_codegen,
tvm_codegen::ScheduleContext& ctx_sched);
bool TryParallelX86(
const tvm::Tensor& tensor,
int64_t to_dim, // fuse dims before to_dim for parallel schedule, 0 to fuse all but last dim
tvm_codegen::CodeGenContext& ctx_codegen,
tvm_codegen::ScheduleContext& ctx_sched);
constexpr auto kNupharScheduleNoParallel = "nuphar_schedule_no_parallel";
} // namespace nuphar
} // namespace onnxruntime

View file

@ -3,26 +3,99 @@
#include "core/providers/nuphar/compiler/x86/scheduler/nuphar_scheduler.h"
#include "core/codegen/passes/scheduler/schedule_utils.h"
#include "core/framework/op_kernel_info.h"
#include "core/providers/nuphar/common/nuphar_settings.h"
#include "core/providers/nuphar/common/analysis/subgraph_codegen_stats.h"
#include "core/providers/nuphar/compiler/nuphar_codegen_ctx.h"
#include "core/codegen/passes/scheduler/schedule_utils.h"
#include "core/providers/nuphar/compiler/x86/scheduler/tensorize/intrin_gemv_ll_extern.h"
#include "core/providers/nuphar/compiler/x86/scheduler/tensorize/intrin_gemv_ll_ir.h"
#include "core/providers/nuphar/compiler/x86/x86_target_info.h"
#include "core/framework/op_kernel_info.h"
#include <tvm/tvm.h>
#include <tvm/ir_pass.h>
namespace onnxruntime {
namespace nuphar {
bool TryParallelX86(
const tvm::Tensor& tensor,
int64_t to_dim,
tvm_codegen::CodeGenContext& ctx_codegen,
tvm_codegen::ScheduleContext& ctx_sched) {
auto compute_op = tensor->op.as<tvm::ComputeOpNode>();
if (compute_op == nullptr) {
return false;
}
if (compute_op->attrs.count(kNupharScheduleNoParallel)) {
return false;
}
const auto& shape = tensor->shape;
int rank = gsl::narrow<int>(shape.size());
tvm::Array<tvm::IterVar> to_fuse_for_parallel;
int64_t rank_to_parallel = (to_dim ? to_dim : rank - 1);
for (int64_t i = 0; i < rank_to_parallel && i < gsl::narrow<int64_t>(compute_op->axis.size()); ++i) {
tvm::IterVar axis = compute_op->axis[i];
auto dom = axis->dom;
if (!tvm::ir::Equal(dom->extent, shape[i])) {
// only do parallel schedule on axis not being fused or split yet
rank_to_parallel = i;
break;
}
to_fuse_for_parallel.push_back(axis);
}
if (to_fuse_for_parallel.size() < 1) {
return false;
}
int64_t per_thread_static_dims = 1;
for (const auto& reduce_axis : compute_op->reduce_axis) {
const int64_t* static_range = tvm::as_const_int(reduce_axis->dom->extent);
if (static_range != nullptr) {
per_thread_static_dims *= *static_range;
}
}
for (int64_t i = rank_to_parallel; i < rank; ++i) {
auto dim = tvm::as_const_int(shape[i]);
if (dim != nullptr) {
per_thread_static_dims *= *dim;
}
}
// skip small per thread workloads, note that symbolic dims are ignored (treated as 1)
int64_t workloads_threshold = Promote<NupharCodeGenCtx>(&ctx_codegen)->GetCodeGenHandle()->parallel_min_workloads;
if (workloads_threshold <= 0 || per_thread_static_dims < workloads_threshold) {
return false;
}
tvm::IterVar parallel_axis;
if (to_fuse_for_parallel.size() > 1) {
ctx_sched.schedule[tensor->op].fuse(to_fuse_for_parallel, &parallel_axis);
} else {
parallel_axis = to_fuse_for_parallel[0];
}
ctx_sched.schedule[tensor->op].parallel(parallel_axis);
return true;
}
bool TryVectorizationX86(
const tvm::Tensor& tensor,
tvm_codegen::CodeGenContext& ctx_codegen,
tvm_codegen::ScheduleContext& ctx_sched) {
if (!ShouldTryVectorization(tensor, ctx_sched))
return false;
CodeGenTargetX86* target = dynamic_cast<CodeGenTargetX86*>(ctx_codegen.GetCodeGenHandle()->codegen_target);
ORT_ENFORCE(target != nullptr);
int64_t natural_vector_size = target->NaturalVectorWidth(tensor->dtype.bits());
// try to use parallel schedule when vectorizing
// note that we don't do logic-or in return value here
// to make sure vectorization is always tried
TryParallelX86(tensor, 0, ctx_codegen, ctx_sched);
return TryVectorization(tensor, natural_vector_size, ctx_sched);
}
@ -176,7 +249,7 @@ static Status ConvScheduleX86(const tvm::Tensor& tensor,
ctx_sched.schedule[tensor->op].reorder({b, oc_chunk, y, xo, ic_chunk, m, n, ic_block, xi, oc_block});
if (ctx_codegen.GetCodeGenHandle()->enable_per_node_parallelized) {
if (ctx_codegen.GetCodeGenHandle()->parallel_min_workloads > 0) {
tvm::Array<tvm::IterVar> fused_axis;
fused_axis.push_back(b);
fused_axis.push_back(oc_chunk);
@ -186,6 +259,7 @@ static Status ConvScheduleX86(const tvm::Tensor& tensor,
ctx_sched.schedule[tensor->op].fuse(fused_axis, &parallel_axis);
ctx_sched.schedule[tensor->op].parallel(parallel_axis);
}
ctx_sched.schedule[tensor->op].vectorize(oc_block);
return Status::OK();
@ -243,7 +317,7 @@ static Status MatMul_2DWeight_Schedule(
ctx_sched.schedule[CC->op].unroll(ki);
ctx_sched.schedule[CC->op].vectorize(yc);
if (ctx_codegen.GetCodeGenHandle()->enable_per_node_parallelized) {
if (ctx_codegen.GetCodeGenHandle()->parallel_min_workloads > 0) {
// parallelize
tvm::Array<tvm::IterVar> fused_axis;
for (size_t d = 0; d < C_rank - 2; ++d)

View file

@ -22,6 +22,7 @@ bool TVM_SCHEDULER_CLASS(Extern, NupharX86TVMRule)::Evaluate(
static bool ReduceVScheduleNupharX86(
const tvm::Tensor& tensor,
tvm_codegen::CodeGenContext& ctx_codegen,
tvm_codegen::ScheduleContext& ctx_sched) {
InsertRootScheduleAndClosure(tensor, ctx_sched);
@ -55,6 +56,8 @@ static bool ReduceVScheduleNupharX86(
if (shape.size() > 0)
head_dim = as_const_int(shape[0]);
bool try_parallel = true;
// unroll packed reduce by checking head dim
if (nullptr != head_dim) {
// if head_dim is already fused, don't unroll
@ -81,8 +84,13 @@ static bool ReduceVScheduleNupharX86(
ctx_sched.schedule[tensor->op].reorder(reorder_axis);
ctx_sched.schedule[tensor->op].unroll(x0);
try_parallel = false;
}
}
if (try_parallel) {
TryParallelX86(tensor, *fuse_dim, ctx_codegen, ctx_sched);
}
} else if (compute_op->axis.size() > 0 &&
tvm::as_const_int(tensor->shape[0]) != nullptr) {
tvm::IterVar x = compute_op->axis[0];
@ -101,7 +109,7 @@ static bool ReduceVScheduleNupharX86(
bool TVM_SCHEDULER_CLASS(Reduce, NupharX86TVMRule)::Evaluate(
const tvm::Tensor& tensor,
const Node*,
tvm_codegen::CodeGenContext&,
tvm_codegen::CodeGenContext& ctx_codegen,
tvm_codegen::ScheduleContext& ctx_sched) {
// respect topi::kCommReduce
if (tensor->op->tag == topi::kCommReduce) {
@ -109,7 +117,7 @@ bool TVM_SCHEDULER_CLASS(Reduce, NupharX86TVMRule)::Evaluate(
}
if (tensor->op->tag == nuphar::kNupharVReduce) {
return ReduceVScheduleNupharX86(tensor, ctx_sched);
return ReduceVScheduleNupharX86(tensor, ctx_codegen, ctx_sched);
}
// unknown goes to InsertRootScheduleAndClosure

View file

@ -7,6 +7,7 @@
#include "core/codegen/mti/mti_tvm_utils.h"
#include "core/codegen/mti/tensor/pad_ops.h"
#include "core/codegen/mti/tensor/reshape_ops.h"
#include "core/providers/nuphar/compiler/x86/scheduler/nuphar_scheduler.h"
#include <topi/reduction.h>
namespace onnxruntime {
@ -85,7 +86,7 @@ tvm::Tensor ReduceValueWithoutSplit(const tvm::Tensor& X,
tvm::Map<std::string, tvm::NodeRef> attrs;
attrs.Set(kNupharVReduceFuseDim, tvm::Expr(fuse_dim));
attrs.Set(kNupharScheduleNoParallel, tvm::Expr(true));
return tvm::compute(output_shape, l_out, name + "_regular_reduce", kNupharVReduce, attrs);
}

View file

@ -125,8 +125,8 @@ NupharExecutionProvider::NupharExecutionProvider(const NupharExecutionProviderIn
handle->shape_inference = whole_graph_shape_infer_;
// TODO: remove
handle->enable_per_node_parallelized = info.enable_per_node_parallel;
handle->parallel_min_workloads = std::stoi(settings.GetOptionValue(kNupharParallelMinWorkloads));
// TODO: remove
handle->allow_unaligned_buffers = info.allow_unaligned_buffers; // TODO remove this

View file

@ -31,9 +31,6 @@ constexpr const char* default_nuphar_target_str = stackvm_target_str;
// Information needed to construct Nuphar execution providers.
struct NupharExecutionProviderInfo {
// By default, let provider decide the target by passing in empty string.
bool enable_per_node_parallel; // TODO: remove
// this flag set TVM build_config with data_alignment=1, at the cost of performance
bool allow_unaligned_buffers;
@ -43,10 +40,8 @@ struct NupharExecutionProviderInfo {
std::string settings;
explicit NupharExecutionProviderInfo(bool unaligned_buffers,
const std::string& str_settings = "",
bool per_node_parallel = true)
: enable_per_node_parallel(per_node_parallel),
allow_unaligned_buffers(unaligned_buffers),
const std::string& str_settings = "")
: allow_unaligned_buffers(unaligned_buffers),
settings(str_settings) {}
NupharExecutionProviderInfo() = default;
};

View file

@ -21,7 +21,7 @@ struct NupharExecutionProviderFactory : IExecutionProviderFactory {
};
std::unique_ptr<IExecutionProvider> NupharExecutionProviderFactory::CreateProvider() {
NupharExecutionProviderInfo info(allow_unaligned_buffers_, settings_, /*per_node_parallel*/ true);
NupharExecutionProviderInfo info(allow_unaligned_buffers_, settings_);
return onnxruntime::make_unique<NupharExecutionProvider>(info);
}

View file

@ -130,7 +130,7 @@ ORT_API_STATUS_IMPL(OrtApis::SetSessionGraphOptimizationLevel, _In_ OrtSessionOp
options->value.graph_optimization_level = onnxruntime::TransformerLevel::Level2;
break;
case ORT_ENABLE_ALL:
options->value.graph_optimization_level = onnxruntime::TransformerLevel::Level3;
options->value.graph_optimization_level = onnxruntime::TransformerLevel::MaxLevel;
break;
default:
return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "graph_optimization_level is not valid");

View file

@ -102,8 +102,12 @@ InferenceSession::InferenceSession(const SessionOptions& session_options,
: session_options_(session_options),
graph_transformation_mgr_(session_options.max_num_graph_transformation_steps),
logging_manager_(logging_manager),
#ifndef USE_OPENMP
thread_pool_(concurrency::CreateThreadPool("intra_op_thread_pool",
session_options.intra_op_num_threads)),
#else
thread_pool_(nullptr),
#endif
inter_op_thread_pool_(session_options.execution_mode == ExecutionMode::ORT_PARALLEL
? concurrency::CreateThreadPool("inter_op_thread_pool",
session_options.inter_op_num_threads)
@ -240,11 +244,7 @@ common::Status InferenceSession::Load(std::function<common::Status(std::shared_p
// all steps complete, mark the model as loaded.
is_model_loaded_ = true;
// and log telemetry
const Env& env = Env::Default();
env.GetTelemetryProvider().LogSessionCreation(session_id_, model_->IrVersion(), model_->ProducerName(), model_->ProducerVersion(),
model_->Domain(), model_->MainGraph().DomainToVersionMap(), model_->MainGraph().Name(),
model_->MetaData(), event_name, execution_providers_.GetIds());
event_name_ = event_name;
} catch (const std::exception& ex) {
status = Status(common::ONNXRUNTIME, common::FAIL, "Exception during loading: " + std::string(ex.what()));
@ -404,7 +404,7 @@ common::Status InferenceSession::TransformGraph(onnxruntime::Graph& graph,
// apply transformers except default transformers
// Default transformers are required for correctness and they are owned and run by inference session
for (int i = static_cast<int>(TransformerLevel::Level1); i < static_cast<int>(TransformerLevel::MaxTransformerLevel); i++) {
for (int i = static_cast<int>(TransformerLevel::Level1); i <= static_cast<int>(TransformerLevel::MaxLevel); i++) {
ORT_RETURN_IF_ERROR_SESSIONID_(graph_transformer_mgr.ApplyTransformers(graph, static_cast<TransformerLevel>(i), *session_logger_));
}
@ -633,6 +633,13 @@ common::Status InferenceSession::Initialize() {
// handle any subgraphs
ORT_RETURN_IF_ERROR_SESSIONID_(InitializeSubgraphSessions(graph, session_state_));
is_inited_ = true;
// and log telemetry
const Env& env = Env::Default();
env.GetTelemetryProvider().LogSessionCreation(session_id_, model_->IrVersion(), model_->ProducerName(), model_->ProducerVersion(),
model_->Domain(), model_->MainGraph().DomainToVersionMap(), model_->MainGraph().Name(),
model_->MetaData(), event_name_, execution_providers_.GetIds());
LOGS(*session_logger_, INFO) << "Session successfully initialized.";
} catch (const NotImplementedException& ex) {
status = ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "Exception during initialization: ", ex.what());
@ -1134,21 +1141,17 @@ void InferenceSession::AddPredefinedTransformers(GraphTransformerManager& transf
}
};
ORT_ENFORCE(graph_optimization_level < TransformerLevel::MaxTransformerLevel,
"Allowed values are 1 and 2. Current level is set to " +
ORT_ENFORCE(graph_optimization_level <= TransformerLevel::MaxLevel,
"Exceeded max transformer level. Current level is set to " +
std::to_string(static_cast<uint32_t>(graph_optimization_level)));
if ((graph_optimization_level >= TransformerLevel::Level1) || !custom_list.empty()) {
add_transformers(TransformerLevel::Level1);
for (int i = static_cast<int>(TransformerLevel::Level1); i <= static_cast<int>(TransformerLevel::MaxLevel); i++) {
TransformerLevel level = static_cast<TransformerLevel>(i);
if ((graph_optimization_level >= level) || !custom_list.empty()) {
add_transformers(level);
}
}
if ((graph_optimization_level >= TransformerLevel::Level2) || !custom_list.empty()) {
add_transformers(TransformerLevel::Level2);
}
if ((graph_optimization_level >= TransformerLevel::Level3) || !custom_list.empty()) {
add_transformers(TransformerLevel::Level3);
}
}
common::Status InferenceSession::WaitForNotification(Notification* p_executor_done, int64_t timeout_in_ms) {

View file

@ -445,6 +445,7 @@ class InferenceSession {
long long total_run_duration_since_last_; // the total duration (us) of Run() calls since the last report
TimePoint time_sent_last_; // the TimePoint of the last report
const long long kDurationBetweenSending = 1000* 1000 * 60 * 10; // duration in (us). send a report every 10 mins
std::string event_name_; // where the model is loaded from: ["model_loading_uri", "model_loading_proto", "model_loading_istream"]
#ifdef ONNXRUNTIME_ENABLE_INSTRUMENT
bool session_activity_started_ = false;

View file

@ -233,7 +233,7 @@ void CreateSequenceOfTensors(AllocatorPtr alloc, const std::string& name_input,
throw std::runtime_error("CreateSequenceOfTensors: Input is not a tensor");
}
auto p_tensor = CreateTensor(alloc, name_input, reinterpret_cast<PyArrayObject*>(py_obj));
p_seq_tensors->tensors[i] = std::move(*(p_tensor.release()));
p_seq_tensors->tensors[i] = std::move(*p_tensor);
}
}

View file

@ -84,7 +84,7 @@ const std::vector<float> ComputeGeluWithErf(const std::vector<float>& input_data
return output;
}
static void RunAddGeluFusionTest(
static void RunBiasGeluTest(
const std::vector<float>& input_a_data,
const std::vector<float>& input_b_data,
const std::vector<int64_t>& input_a_dims,
@ -92,20 +92,18 @@ static void RunAddGeluFusionTest(
if (HasCudaEnvironment(0)) {
std::vector<float> output_data = ComputeGeluWithErf(Add_Simple(input_a_data, input_b_data));
OpTester tester("AddGeluFusion", 1, onnxruntime::kMSDomain);
OpTester tester("BiasGelu", 1, onnxruntime::kMSDomain);
const std::vector<int64_t>& output_dims = input_a_dims.size() >= input_b_dims.size() ? input_a_dims : input_b_dims;
tester.AddInput<float>("A", input_a_dims, input_a_data);
tester.AddInput<float>("B", input_b_dims, input_b_data);
tester.AddOutput<float>("C", output_dims, output_data);
std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
execution_providers.push_back(DefaultCudaExecutionProvider());
tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
tester.Run();
}
}
TEST(AddGeluFusionTest, Two_One_Dim) {
TEST(BiasGeluTest, Two_One_Dim) {
std::vector<float> input_a_data = {
0.8f, -0.5f, 0.0f, 1.f,
0.5f, 0.2f, 0.3f, -0.6f};
@ -113,19 +111,7 @@ TEST(AddGeluFusionTest, Two_One_Dim) {
std::vector<float> input_b_data = {
-0.5f, 0.6f, 1.2f, 2.1f};
RunAddGeluFusionTest(input_a_data, input_b_data, {2, 4}, {4});
}
TEST(AddGeluFusionTest, Two_Two_Dim) {
std::vector<float> input_a_data = {
0.8f, -0.5f, 0.0f, 1.f,
0.5f, 0.2f, 0.3f, -0.6f};
std::vector<float> input_b_data = {
-0.5f, 0.6f, 1.2f, 2.1f,
0.4f, 0.6f, 0.2f, -0.4f};
RunAddGeluFusionTest(input_a_data, input_b_data, {2, 4}, {2, 4});
RunBiasGeluTest(input_a_data, input_b_data, {2, 4}, {4});
}
} // namespace test

View file

@ -89,7 +89,7 @@ class LayerNormOpTester : public OpTester {
}
void Run() {
#ifndef NDEBUG
run_called_ = true;
run_called_ = true;
#endif
std::vector<MLValue> cpu_fetches;
std::vector<MLValue> cuda_fetches;
@ -110,7 +110,7 @@ class LayerNormOpTester : public OpTester {
// Compare GPU with original subgraph
if (DefaultCudaExecutionProvider()) {
ASSERT_TRUE(cuda_fetches.size() == subgraph_fetches.size());
for(size_t i = 0; i < cuda_fetches.size(); i++) {
for (size_t i = 0; i < cuda_fetches.size(); i++) {
if (cuda_fetches[i].IsTensor() && subgraph_fetches[i].IsTensor()) {
VLOGS_DEFAULT(1) << "Checking tensor " << i;
CheckTensor(subgraph_fetches[i].Get<Tensor>(), cuda_fetches[i].Get<Tensor>(), 1e-3, 1e-3);

View file

@ -20,45 +20,45 @@ static void RunTest(
int sequence_length,
int hidden_size,
bool use_float16 = false) {
int min_cuda_architecture = use_float16 ? 530 : 0;
if (HasCudaEnvironment(min_cuda_architecture)) {
// Input and output shapes
// Input 0 - input: (batch_size, sequence_length, hidden_size)
// Input 1 - skip : (batch_size, sequence_length, hidden_size)
// Input 2 - gamma: (hidden_size)
// Input 3 - beta : (hidden_size)
// Output : (batch_size, sequence_length, hidden_size)
std::vector<int64_t> input_dims = {batch_size, sequence_length, hidden_size};
std::vector<int64_t> skip_dims = input_dims;
std::vector<int64_t> gamma_dims = {hidden_size};
std::vector<int64_t> beta_dims = gamma_dims;
std::vector<int64_t> bias_dims = gamma_dims;
std::vector<int64_t> output_dims = input_dims;
if (!use_float16) {
OpTester test("SkipLayerNormalization", 1, onnxruntime::kMSDomain);
test.AddInput<float>("input", input_dims, input_data);
test.AddInput<float>("skip", skip_dims, skip_data);
test.AddInput<float>("gamma", gamma_dims, gamma_data);
test.AddInput<float>("beta", beta_dims, beta_data);
// Input and output shapes
// Input 0 - input: (batch_size, sequence_length, hidden_size)
// Input 1 - skip : (batch_size, sequence_length, hidden_size)
// Input 2 - gamma: (hidden_size)
// Input 3 - beta : (hidden_size)
// Output : (batch_size, sequence_length, hidden_size)
std::vector<int64_t> input_dims = {batch_size, sequence_length, hidden_size};
std::vector<int64_t> skip_dims = input_dims;
std::vector<int64_t> gamma_dims = {hidden_size};
std::vector<int64_t> beta_dims = gamma_dims;
std::vector<int64_t> bias_dims = gamma_dims;
std::vector<int64_t> output_dims = input_dims;
if (use_float16) {
test.AddInput<MLFloat16>("input", input_dims, ToFloat16(input_data));
test.AddInput<MLFloat16>("skip", skip_dims, ToFloat16(skip_data));
test.AddInput<MLFloat16>("gamma", gamma_dims, ToFloat16(gamma_data));
test.AddInput<MLFloat16>("beta", beta_dims, ToFloat16(beta_data));
if (!bias_data.empty()) {
test.AddInput<MLFloat16>("bias", bias_dims, ToFloat16(bias_data));
}
test.AddOutput<MLFloat16>("output", output_dims, ToFloat16(output_data));
} else {
test.AddInput<float>("input", input_dims, input_data);
test.AddInput<float>("skip", skip_dims, skip_data);
test.AddInput<float>("gamma", gamma_dims, gamma_data);
test.AddInput<float>("beta", beta_dims, beta_data);
if (!bias_data.empty()) {
test.AddInput<float>("bias", bias_dims, bias_data);
}
test.AddOutput<float>("output", output_dims, output_data);
if (!bias_data.empty()) {
test.AddInput<float>("bias", bias_dims, bias_data);
}
test.AddOutput<float>("output", output_dims, output_data);
test.Run();
} else if (HasCudaEnvironment(530 /*min_cuda_architecture*/)) {
OpTester test("SkipLayerNormalization", 1, onnxruntime::kMSDomain);
test.AddInput<MLFloat16>("input", input_dims, ToFloat16(input_data));
test.AddInput<MLFloat16>("skip", skip_dims, ToFloat16(skip_data));
test.AddInput<MLFloat16>("gamma", gamma_dims, ToFloat16(gamma_data));
test.AddInput<MLFloat16>("beta", beta_dims, ToFloat16(beta_data));
if (!bias_data.empty()) {
test.AddInput<MLFloat16>("bias", bias_dims, ToFloat16(bias_data));
}
test.AddOutput<MLFloat16>("output", output_dims, ToFloat16(output_data));
std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
execution_providers.push_back(DefaultCudaExecutionProvider());
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);

View file

@ -1428,7 +1428,7 @@ TEST(InferenceSessionTests, TestCopyToFromDevices) {
TEST(InferenceSessionTests, TestRegisterTransformers) {
string model_uri = "testdata/transform/fusion/fuse-conv-bn-mul-add-unsqueeze.onnx";
for (int i = static_cast<int>(TransformerLevel::Default); i < static_cast<int>(TransformerLevel::MaxTransformerLevel); i++) {
for (int i = static_cast<int>(TransformerLevel::Default); i <= static_cast<int>(TransformerLevel::MaxLevel); i++) {
SessionOptions so;
so.session_logid = "InferenceSessionTests.TestL1AndL2Transformers";
so.graph_optimization_level = static_cast<TransformerLevel>(i);

View file

@ -14,10 +14,12 @@
#include "core/optimizer/conv_activation_fusion.h"
#include "core/optimizer/dropout_elimination.h"
#include "core/optimizer/gemm_activation_fusion.h"
#include "core/optimizer/add_gelu_fusion.h"
#include "core/optimizer/bias_gelu_fusion.h"
#include "core/optimizer/gelu_fusion.h"
#include "core/optimizer/gelu_approximation.h"
#include "core/optimizer/layer_norm_fusion.h"
#include "core/optimizer/skip_layer_norm_fusion.h"
#include "core/optimizer/embed_layer_norm_fusion.h"
#include "core/optimizer/graph_transformer.h"
#include "core/optimizer/graph_transformer_mgr.h"
#include "core/optimizer/identity_elimination.h"
@ -1081,15 +1083,15 @@ TEST(GraphTransformationTests, GeluFusionTest) {
ASSERT_TRUE(op_to_count["Gelu"] == 1);
}
TEST(GraphTransformationTests, AddGeluFusionTest) {
auto model_uri = MODEL_FOLDER "fusion/add_gelu_fusion.onnx";
TEST(GraphTransformationTests, BiasGeluTest) {
auto model_uri = MODEL_FOLDER "fusion/bias_gelu_fusion.onnx";
std::shared_ptr<Model> p_model;
ASSERT_TRUE(Model::Load(model_uri, p_model, nullptr, DefaultLoggingManager().DefaultLogger()).IsOK());
Graph& graph = p_model->MainGraph();
onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
graph_transformation_mgr.Register(onnxruntime::make_unique<GeluFusion>(), TransformerLevel::Level2);
graph_transformation_mgr.Register(onnxruntime::make_unique<AddGeluFusion>(), TransformerLevel::Level2);
graph_transformation_mgr.Register(onnxruntime::make_unique<BiasGelu>(), TransformerLevel::Level2);
auto ret = graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, DefaultLoggingManager().DefaultLogger());
ASSERT_TRUE(ret.IsOK());
std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
@ -1098,7 +1100,59 @@ TEST(GraphTransformationTests, AddGeluFusionTest) {
ASSERT_TRUE(op_to_count["Erf"] == 0);
ASSERT_TRUE(op_to_count["Mul"] == 0);
ASSERT_TRUE(op_to_count["Gelu"] == 0);
ASSERT_TRUE(op_to_count["GeluFusion"] == 0);
ASSERT_TRUE(op_to_count["BiasGelu"] == 1);
}
// Test Gelu -> FastGelu
TEST(GraphTransformationTests, GeluApproximation_Gelu) {
auto model_uri = MODEL_FOLDER "approximation/gelu.onnx";
std::shared_ptr<Model> p_model;
ASSERT_TRUE(Model::Load(model_uri, p_model, nullptr, DefaultLoggingManager().DefaultLogger()).IsOK());
Graph& graph = p_model->MainGraph();
onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
graph_transformation_mgr.Register(onnxruntime::make_unique<GeluApproximation>(), TransformerLevel::Level2);
auto ret = graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, DefaultLoggingManager().DefaultLogger());
ASSERT_TRUE(ret.IsOK());
std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
EXPECT_EQ(op_to_count["Gelu"], 0);
EXPECT_EQ(op_to_count["FastGelu"], 1);
}
// Test AddGeluFusion -> FastGelu
TEST(GraphTransformationTests, GeluApproximation_Gelu_Add_Bias) {
auto model_uri = MODEL_FOLDER "approximation/gelu_add_bias.onnx";
std::shared_ptr<Model> p_model;
ASSERT_TRUE(Model::Load(model_uri, p_model, nullptr, DefaultLoggingManager().DefaultLogger()).IsOK());
Graph& graph = p_model->MainGraph();
onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
graph_transformation_mgr.Register(onnxruntime::make_unique<GeluApproximation>(), TransformerLevel::Level2);
auto ret = graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, DefaultLoggingManager().DefaultLogger());
ASSERT_TRUE(ret.IsOK());
std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
EXPECT_EQ(op_to_count["BiasGelu"], 0);
EXPECT_EQ(op_to_count["FastGelu"], 1);
}
// Test MatMul & AddGeluFusion -> MatMul & FastGelu
TEST(GraphTransformationTests, GeluApproximation_Gelu_Add_MatMul) {
auto model_uri = MODEL_FOLDER "approximation/gelu_add_matmul.onnx";
std::shared_ptr<Model> p_model;
ASSERT_TRUE(Model::Load(model_uri, p_model, nullptr, DefaultLoggingManager().DefaultLogger()).IsOK());
Graph& graph = p_model->MainGraph();
onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
graph_transformation_mgr.Register(onnxruntime::make_unique<GeluApproximation>(), TransformerLevel::Level2);
auto ret = graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, DefaultLoggingManager().DefaultLogger());
ASSERT_TRUE(ret.IsOK());
std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
EXPECT_EQ(op_to_count["BiasGelu"], 0);
EXPECT_EQ(op_to_count["MatMul"], 1);
EXPECT_EQ(op_to_count["FastGelu"], 1);
}
TEST(GraphTransformationTests, LayerNormFusionTest) {
@ -1200,6 +1254,48 @@ TEST(GraphTransformationTests, SkipLayerNormFusionTest) {
TestSkipLayerNormFusion(MODEL_FOLDER "fusion/skip_layer_norm_format2.onnx");
TestSkipLayerNormFusion(MODEL_FOLDER "fusion/skip_layer_norm_format3.onnx");
}
TEST(GraphTransformationTests, EmbedLayerNormFusionFormat1) {
auto model_uri = MODEL_FOLDER "fusion/embed_layer_norm_format1.onnx";
std::shared_ptr<Model> p_model;
ASSERT_TRUE(Model::Load(model_uri, p_model, nullptr, DefaultLoggingManager().DefaultLogger()).IsOK());
Graph& graph = p_model->MainGraph();
onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
graph_transformation_mgr.Register(onnxruntime::make_unique<EmbedLayerNormFusion>(), TransformerLevel::Level2);
auto ret = graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, DefaultLoggingManager().DefaultLogger());
ASSERT_TRUE(ret.IsOK());
std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
ASSERT_TRUE(op_to_count["Gather"] == 0);
ASSERT_TRUE(op_to_count["Add"] == 0);
ASSERT_TRUE(op_to_count["ReduceSum"] == 0);
ASSERT_TRUE(op_to_count["Attention"] == 1);
ASSERT_TRUE(op_to_count["SkipLayerNormalization"] == 0);
ASSERT_TRUE(op_to_count["EmbedLayerNormalization"] == 1);
}
TEST(GraphTransformationTests, EmbedLayerNormFusionFormat2) {
auto model_uri = MODEL_FOLDER "fusion/embed_layer_norm_format2.onnx";
std::shared_ptr<Model> p_model;
ASSERT_TRUE(Model::Load(model_uri, p_model, nullptr, DefaultLoggingManager().DefaultLogger()).IsOK());
Graph& graph = p_model->MainGraph();
onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
graph_transformation_mgr.Register(onnxruntime::make_unique<EmbedLayerNormFusion>(), TransformerLevel::Level2);
auto ret = graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, DefaultLoggingManager().DefaultLogger());
ASSERT_TRUE(ret.IsOK());
std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
ASSERT_TRUE(op_to_count["Shape"] == 0);
ASSERT_TRUE(op_to_count["Expand"] == 0);
ASSERT_TRUE(op_to_count["Gather"] == 0);
ASSERT_TRUE(op_to_count["Add"] == 0);
ASSERT_TRUE(op_to_count["ReduceSum"] == 0);
ASSERT_TRUE(op_to_count["Attention"] == 1);
ASSERT_TRUE(op_to_count["SkipLayerNormalization"] == 0);
ASSERT_TRUE(op_to_count["EmbedLayerNormalization"] == 1);
}
#endif
} // namespace test

View file

@ -51,7 +51,7 @@ TEST(GraphTransformerUtilsTests, TestGenerateGraphTransformers) {
}
}
ASSERT_TRUE(rule_transformer && rule_transformer->RulesCount() == 1);
transformers = optimizer_utils::GenerateTransformers(TransformerLevel::Level2, {}, custom_list);
#ifndef DISABLE_CONTRIB_OPS
ASSERT_TRUE(transformers.size() == 1);

View file

@ -32,7 +32,7 @@ Options:
-v: Show verbose information.
-x: [intra_op_num_threads]: Sets the number of threads used to parallelize the execution within nodes. A value of 0 means the test will auto-select a default. Must >=0.
-x: [intra_op_num_threads]: Sets the number of threads used to parallelize the execution within nodes. A value of 0 means the test will auto-select a default. Must >=0. If OpenMP is enabled, this configuration will be ignored.
-y: [inter_op_num_threads]: Sets the number of threads used to parallelize the execution of the graph (across nodes), A value of 0 means the test will auto-select a default. Must >=0.

View file

@ -41,7 +41,7 @@ namespace perftest {
"\t-p [profile_file]: Specifies the profile name to enable profiling and dump the profile data to the file.\n"
"\t-s: Show statistics result, like P75, P90.\n"
"\t-v: Show verbose information.\n"
"\t-x [intra_op_num_threads]: Sets the number of threads used to parallelize the execution within nodes, A value of 0 means ORT will pick a default. Must >=0.\n"
"\t-x [intra_op_num_threads]: Sets the number of threads used to parallelize the execution within nodes, A value of 0 means ORT will pick a default. Must >=0. If OpenMP is enabled, this configuration will be ignored.\n"
"\t-y [inter_op_num_threads]: Sets the number of threads used to parallelize the execution of the graph (across nodes), A value of 0 means ORT will pick a default. Must >=0.\n"
"\t-P: Use parallel executor instead of sequential executor.\n"
"\t-o [optimization level]: Default is 1. Valid values are 0 (disable), 1 (basic), 2 (extended), 99 (all).\n"
@ -123,10 +123,15 @@ namespace perftest {
test_config.run_config.f_verbose = true;
break;
case 'x':
#ifdef USE_OPENMP
fprintf(stderr, "cannot use argument '-x' when OpenMP is enabled.\n");
return false;
#else
test_config.run_config.intra_op_num_threads = static_cast<int>(OrtStrtol<PATH_CHAR_TYPE>(optarg, nullptr));
if (test_config.run_config.intra_op_num_threads < 0) {
return false;
}
#endif
break;
case 'y':
test_config.run_config.inter_op_num_threads = static_cast<int>(OrtStrtol<PATH_CHAR_TYPE>(optarg, nullptr));

View file

@ -82,7 +82,7 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
} else if (provider_name == onnxruntime::kAclExecutionProvider) {
#ifdef USE_ACL
Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ACL(session_options,
performance_test_config.run_config.enable_cpu_mem_arena ? 1 : 0));
performance_test_config.run_config.enable_cpu_mem_arena ? 1 : 0));
#else
ORT_THROW("Acl is not supported in this build\n");
#endif
@ -100,8 +100,11 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
else
session_options.DisableMemPattern();
session_options.SetExecutionMode(performance_test_config.run_config.execution_mode);
#ifndef USE_OPENMP
fprintf(stdout, "Setting intra_op_num_threads to %d\n", performance_test_config.run_config.intra_op_num_threads);
session_options.SetIntraOpNumThreads(performance_test_config.run_config.intra_op_num_threads);
#endif
if (performance_test_config.run_config.execution_mode == ExecutionMode::ORT_PARALLEL) {
fprintf(stdout, "Setting inter_op_num_threads to %d\n", performance_test_config.run_config.inter_op_num_threads);

View file

@ -0,0 +1,101 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/platform/threadpool.h"
#include <core/common/make_unique.h>
#include "gtest/gtest.h"
#include <algorithm>
#include <memory>
#include <functional>
#include <mutex>
using namespace onnxruntime::concurrency;
namespace {
struct TestData {
explicit TestData(int num) : data(num, 0) {}
std::vector<int> data;
std::mutex mutex;
};
// This unittest tests ThreadPool function by counting the number of calls to function with each index.
// the function should be called exactly once for each element.
std::unique_ptr<TestData> CreateTestData(int num) {
return onnxruntime::make_unique<TestData>(num);
}
void IncrementElement(TestData& test_data, int i) {
std::lock_guard<std::mutex> lock(test_data.mutex);
test_data.data[i]++;
}
void ValidateTestData(TestData& test_data) {
ASSERT_TRUE(std::count_if(test_data.data.cbegin(),
test_data.data.cend(),
[](int i) { return i != 1; }) == 0);
}
void CreateThreadPoolAndTest(const std::string& name, int num_threads, const std::function<void(ThreadPool*)>& test_body) {
auto tp = onnxruntime::make_unique<ThreadPool>(name, num_threads);
test_body(tp.get());
}
void TestParallelFor(const std::string& name, int num_threads, int num_tasks) {
auto test_data = CreateTestData(num_tasks);
CreateThreadPoolAndTest(name, num_threads, [&](ThreadPool* tp) {
tp->ParallelFor(num_tasks, [&](int i) {
IncrementElement(*test_data, i);
});
});
ValidateTestData(*test_data);
}
void TestBatchParallelFor(const std::string& name, int num_threads, int num_tasks, int batch_size) {
auto test_data = CreateTestData(num_tasks);
CreateThreadPoolAndTest(name, num_threads, [&](ThreadPool* tp) {
tp->BatchParallelFor(
num_tasks, [&](int i) {
IncrementElement(*test_data, i);
},
batch_size);
});
ValidateTestData(*test_data);
}
} // namespace
TEST(ThreadPoolTest, TestParallelFor_2_Thread_NoTask) {
TestParallelFor("TestParallelFor_2_Thread_NoTask", 2, 0);
}
TEST(ThreadPoolTest, TestParallelFor_2_Thread_50_Task) {
TestParallelFor("TestParallelFor_2_Thread_50_Task", 2, 50);
}
TEST(ThreadPoolTest, TestParallelFor_1_Thread_50_Task) {
TestParallelFor("TestParallelFor_1_Thread_50_Task", 1, 50);
}
TEST(ThreadPoolTest, TestBatchParallelFor_2_Thread_50_Task_10_Batch) {
TestBatchParallelFor("TestBatchParallelFor_2_Thread_50_Task_10_Batch", 2, 50, 10);
}
TEST(ThreadPoolTest, TestBatchParallelFor_2_Thread_50_Task_0_Batch) {
TestBatchParallelFor("TestBatchParallelFor_2_Thread_50_Task_0_Batch", 2, 50, 0);
}
TEST(ThreadPoolTest, TestBatchParallelFor_2_Thread_50_Task_1_Batch) {
TestBatchParallelFor("TestBatchParallelFor_2_Thread_50_Task_1_Batch", 2, 50, 1);
}
TEST(ThreadPoolTest, TestBatchParallelFor_2_Thread_50_Task_100_Batch) {
TestBatchParallelFor("TestBatchParallelFor_2_Thread_50_Task_100_Batch", 2, 50, 100);
}
TEST(ThreadPoolTest, TestBatchParallelFor_2_Thread_81_Task_20_Batch) {
TestBatchParallelFor("TestBatchParallelFor_2_Thread_81_Task_20_Batch", 2, 81, 20);
}

View file

@ -61,7 +61,7 @@ class TestInferenceSessionKeras(unittest.TestCase):
# runtime
content = converted_model.SerializeToString()
rt = onnxrt.InferenceSession(content)
input = {'conv2d_1_input_0': x}
input = {rt.get_inputs()[0].name: x}
actual_rt = rt.run(None, input)
self.assertEqual(len(actual_rt), 1)
np.testing.assert_allclose(actual, actual_rt[0], rtol=1e-05, atol=1e-08)

View file

@ -0,0 +1,13 @@
:~
#
ACGelu_1"Gelu: com.microsoft Gelu_NoBiasZ$
A

batch
seq_len
b$
C

batch
seq_len
B

View file

@ -0,0 +1,17 @@

3
A
BCAddGeluFusion_1"BiasGelu: com.microsoft Gelu_AddBiasZ$
A

batch
seq_len
Z
B

b$
C

batch
seq_len
B

View file

@ -0,0 +1,25 @@


A
BCMatMul_1"MatMul
3
C
DEAddGeluFusion_1"BiasGelu: com.microsoftMatMul_AddGeluFusionZ$
A

batch
seq_len
xZ
B


Z
D

b$
E

batch
seq_len
B

View file

@ -0,0 +1,65 @@
import onnx
from onnx import helper
from onnx import TensorProto
graph = helper.make_graph(
[ # nodes
# Add node before Gelu
helper.make_node("Gelu", ["A"], ["C"], "Gelu_1", domain="com.microsoft"),
],
"Gelu_NoBias", #name
[ # inputs
helper.make_tensor_value_info('A', TensorProto.FLOAT, ['batch', 'seq_len', 3072]),
],
[ # outputs
helper.make_tensor_value_info('C', TensorProto.FLOAT, ['batch', 'seq_len', 3072]),
],
[ # initializers
]
)
model = helper.make_model(graph)
onnx.save(model, r'gelu.onnx')
graph = helper.make_graph(
[ # nodes
# Add node before Gelu
helper.make_node("BiasGelu", ["A", "B"], ["C"], "AddGeluFusion_1", domain="com.microsoft"),
],
"Gelu_AddBias", #name
[ # inputs
helper.make_tensor_value_info('A', TensorProto.FLOAT, ['batch', 'seq_len', 3072]),
helper.make_tensor_value_info('B', TensorProto.FLOAT, [3072]),
],
[ # outputs
helper.make_tensor_value_info('C', TensorProto.FLOAT, ['batch', 'seq_len', 3072]),
],
[ # initializers
]
)
model = helper.make_model(graph)
onnx.save(model, r'gelu_add_bias.onnx')
graph = helper.make_graph(
[ # nodes
# Add node before Gelu
helper.make_node("MatMul", ["A", "B"], ["C"], "MatMul_1"),
helper.make_node("BiasGelu", ["C", "D"], ["E"], "AddGeluFusion_1", domain="com.microsoft"),
],
"MatMul_AddGeluFusion", #name
[ # inputs
helper.make_tensor_value_info('A', TensorProto.FLOAT, ['batch', 'seq_len', 'x']),
helper.make_tensor_value_info('B', TensorProto.FLOAT, [128, 3072]),
helper.make_tensor_value_info('D', TensorProto.FLOAT, [3072]),
],
[ # outputs
helper.make_tensor_value_info('E', TensorProto.FLOAT, ['batch', 'seq_len', 3072]),
],
[ # initializers
]
)
model = helper.make_model(graph)
onnx.save(model, r'gelu_add_matmul.onnx')

View file

@ -17,7 +17,7 @@ graph = helper.make_graph(
"Gelu_Add_Fusion", #name
[ # inputs
helper.make_tensor_value_info('A', TensorProto.FLOAT, ['unk_1', 'unk_2', 3072]),
helper.make_tensor_value_info('B', TensorProto.FLOAT, ['unk_1', 'unk_2', 3072]),
helper.make_tensor_value_info('B', TensorProto.FLOAT, [3072]),
],
[ # outputs
helper.make_tensor_value_info('C', TensorProto.FLOAT, ['unk_3', 'unk_4', 3072]),
@ -30,4 +30,4 @@ graph = helper.make_graph(
)
model = helper.make_model(graph)
onnx.save(model, r'add_gelu_fusion.onnx')
onnx.save(model, r'bias_gelu_fusion.onnx')

View file

@ -110,7 +110,9 @@ try:
logger.info('removing %s', file)
remove(file)
except ImportError:
except ImportError as error:
print("Error importing dependencies:")
print(error)
bdist_wheel = None
# Additional binaries
@ -182,6 +184,11 @@ if nightly_build:
date_suffix = str(datetime.datetime.now().date().strftime("%m%d"))
version_number = version_number + ".dev" + date_suffix
cmd_classes = {}
if bdist_wheel is not None :
cmd_classes['bdist_wheel'] = bdist_wheel
cmd_classes['build_ext'] = build_ext
# Setup
setup(
name=package_name,
@ -190,7 +197,7 @@ setup(
long_description=long_description,
author='Microsoft Corporation',
author_email='onnx@microsoft.com',
cmdclass={'bdist_wheel': bdist_wheel, 'build_ext': build_ext},
cmdclass=cmd_classes,
license="MIT License",
packages=['onnxruntime',
'onnxruntime.backend',

View file

@ -14,7 +14,6 @@ import re
import shutil
import subprocess
import sys
import warnings
import hashlib
from os.path import expanduser
@ -60,6 +59,7 @@ Use the individual flags to only run the specified stages.
If you've done an update that fetched external dependencies you have to build without --parallel the first time.
Once that's done, run with "--build --parallel --test" to just build in parallel and run tests.''')
parser.add_argument("--test", action='store_true', help="Run unit tests.")
parser.add_argument("--skip_tests", action='store_true', help="Skip all tests.")
# enable ONNX tests
parser.add_argument("--enable_onnx_tests", action='store_true',
@ -150,7 +150,7 @@ Use the individual flags to only run the specified stages.
parser.add_argument("--tensorrt_home", help="Path to TensorRT installation dir")
parser.add_argument("--use_full_protobuf", action='store_true', help="Use the full protobuf library")
parser.add_argument("--disable_contrib_ops", action='store_true', help="Disable contrib ops (reduces binary size)")
parser.add_argument("--skip_onnx_tests", action='store_true', help="Explicitly disable all onnx related tests")
parser.add_argument("--skip_onnx_tests", action='store_true', help="Explicitly disable all onnx related tests. Note: Use --skip_tests to skip all tests.")
parser.add_argument("--skip_winml_tests", action='store_true', help="Explicitly disable all WinML related tests")
parser.add_argument("--enable_msvc_static_runtime", action='store_true', help="Enable static linking of MSVC runtimes.")
parser.add_argument("--enable_language_interop_ops", action='store_true', help="Enable operator implemented in language other than cpp")
@ -193,7 +193,9 @@ def run_subprocess(args, cwd=None, capture=False, dll_path=None, shell=False, en
stdout, stderr = (subprocess.PIPE, subprocess.STDOUT) if capture else (None, None)
my_env.update(env)
return subprocess.run(args, cwd=cwd, check=True, stdout=stdout, stderr=stderr, env=my_env, shell=shell)
completed_process = subprocess.run(args, cwd=cwd, check=True, stdout=stdout, stderr=stderr, env=my_env, shell=shell)
log.debug("Subprocess completed. Return code=" + str(completed_process.returncode))
return completed_process
def update_submodules(source_dir):
run_subprocess(["git", "submodule", "sync", "--recursive"], cwd=source_dir)
@ -555,26 +557,33 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs, enab
return
if is_windows():
cwd = os.path.join(cwd, config)
run_subprocess([sys.executable, 'onnxruntime_test_python.py'], cwd=cwd, dll_path=dll_path)
try:
import onnx
import scipy # gen_test_models.py used by onnx_test has a dependency on scipy
onnx_test = True
except ImportError:
warnings.warn("onnx is not installed. Following test cannot be run.")
except ImportError as error:
log.exception(error)
log.warning("onnx or scipy is not installed. The ONNX tests will be skipped.")
onnx_test = False
if onnx_test:
run_subprocess([sys.executable, 'onnxruntime_test_python_backend.py'], cwd=cwd, dll_path=dll_path)
run_subprocess([sys.executable, os.path.join(source_dir,'onnxruntime','test','onnx','gen_test_models.py'),'--output_dir','test_models'], cwd=cwd)
run_subprocess([sys.executable, os.path.join(source_dir,'onnxruntime','test','onnx','gen_test_models.py'),
'--output_dir','test_models'], cwd=cwd)
run_subprocess([os.path.join(cwd,'onnx_test_runner'), 'test_models'], cwd=cwd)
if config != 'Debug':
run_subprocess([sys.executable, 'onnx_backend_test_series.py'], cwd=cwd, dll_path=dll_path)
if not args.skip_keras_test:
try:
import onnxmltools
import keras
onnxml_test = True
except ImportError:
warnings.warn("onnxmltools and keras are not installed. Following test cannot be run.")
log.warning("onnxmltools and keras are not installed. The keras tests will be skipped.")
onnxml_test = False
if onnxml_test:
run_subprocess([sys.executable, 'onnxruntime_test_python_keras.py'], cwd=cwd, dll_path=dll_path)
@ -844,6 +853,9 @@ def main():
else:
args.test = True
if args.skip_tests:
args.test = False
if args.use_tensorrt:
args.use_cuda = True