From 650fb8754b03ae89f6b0d456370ecb2dc901f4bd Mon Sep 17 00:00:00 2001 From: Yang Chen <40417152+yangchen-MS@users.noreply.github.com> Date: Thu, 26 Sep 2019 16:29:30 -0700 Subject: [PATCH] use MLAS for nuphar's pool ops (#1937) * call MLAS's pooling function as an external call for Nuphar Note that at the moment Nuphar provider doesn't handle the cases below: - symbolic height/width dimensions - Indices output of MaxPool - non-default dilations * unify the pool interface for mti and mti_x86 --- onnxruntime/core/codegen/mti/nn/pool_ops.cc | 73 ++++---- onnxruntime/core/codegen/mti/nn/pool_ops.h | 28 +-- .../passes/op_ir_creator/nn/pool_ops.cc | 100 +++------- .../nuphar/compiler/codegen_manager.cc | 5 +- .../nuphar/compiler/nuphar_op_ir_builder.cc | 1 - .../compiler/x86/op_ir_creator/all_ops.h | 9 + .../x86/op_ir_creator/math/unary_ops.cc | 1 - .../compiler/x86/op_ir_creator/nn/pool_ops.cc | 83 +++++++++ onnxruntime/core/providers/nuphar/kernel.h | 12 +- .../providers/nuphar/mti_x86/nn/pool_ops.cc | 176 ++++++++++++++++++ .../providers/nuphar/mti_x86/nn/pool_ops.h | 37 ++++ .../nuphar/nuphar_execution_provider.cc | 17 ++ 12 files changed, 417 insertions(+), 125 deletions(-) create mode 100644 onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/nn/pool_ops.cc create mode 100644 onnxruntime/core/providers/nuphar/mti_x86/nn/pool_ops.cc create mode 100644 onnxruntime/core/providers/nuphar/mti_x86/nn/pool_ops.h diff --git a/onnxruntime/core/codegen/mti/nn/pool_ops.cc b/onnxruntime/core/codegen/mti/nn/pool_ops.cc index 5af944186c..868a14748c 100644 --- a/onnxruntime/core/codegen/mti/nn/pool_ops.cc +++ b/onnxruntime/core/codegen/mti/nn/pool_ops.cc @@ -3,6 +3,9 @@ #include "core/codegen/mti/nn/pool_ops.h" +#include "core/codegen/mti/mti_tvm_utils.h" +#include "core/mlas/inc/mlas.h" +#include "core/providers/cpu/nn/pool_attributes.h" #include namespace onnxruntime { @@ -10,48 +13,50 @@ namespace tvm_codegen { // TODO: topi only support 2d-pool, MaxPool1d and 
MaxPool3d will need to be added if necessary. // only support version < 8 for topi doesn't come with implementation to output index tensor -tvm::Tensor MaxPool( - const tvm::Tensor& input, - const tvm::Array& kernel_size, - const tvm::Array& stride_size, - const tvm::Array& padding_size, - const std::string& layout, - bool count_include_pad) { - return topi::nn::pool(input, kernel_size, stride_size, padding_size, - topi::nn::kMaxPool, - false, - layout, - count_include_pad); +tvm::Tensor MaxPool(const tvm::Tensor& input, + const PoolAttributes& pool_attrs, + const tvm::Array& /*output_shape*/, + const std::string& /*name*/) { + return topi::nn::pool(input, + ToTvmArray(pool_attrs.kernel_shape), + ToTvmArray(pool_attrs.strides), + ToTvmArray(pool_attrs.pads), + /*pool_type*/ topi::nn::kMaxPool, + /*ceil_mode*/ false, + /*layout*/ pool_attrs.storage_order == 0 ? "NCWH" : "NCHW", + pool_attrs.count_include_pad); } -tvm::Tensor AveragePool( - const tvm::Tensor& input, - const tvm::Array& kernel_size, - const tvm::Array& stride_size, - const tvm::Array& padding_size, - const std::string& layout, - bool count_include_pad) { - return topi::nn::pool(input, kernel_size, stride_size, padding_size, - topi::nn::kAvgPool, - false, - layout, - count_include_pad); +tvm::Tensor AveragePool(const tvm::Tensor& input, + const PoolAttributes& pool_attrs, + const tvm::Array& /*output_shape*/, + const std::string& /*name*/) { + return topi::nn::pool(input, + ToTvmArray(pool_attrs.kernel_shape), + ToTvmArray(pool_attrs.strides), + ToTvmArray(pool_attrs.pads), + /*pool_type*/ topi::nn::kAvgPool, + /*ceil_mode*/ false, + /*layout*/ "NCHW", + pool_attrs.count_include_pad); } -tvm::Tensor GlobalMaxPool( - const tvm::Tensor& input, - const std::string& layout) { +tvm::Tensor GlobalMaxPool(const tvm::Tensor& input, + const PoolAttributes& /*pool_attrs*/, + const tvm::Array& /*output_shape*/, + const std::string& /*name*/) { return topi::nn::global_pool(input, - topi::nn::kMaxPool, - layout); + 
/*pool_type*/ topi::nn::kMaxPool, + /*layout*/ "NCHW"); } -tvm::Tensor GlobalAveragePool( - const tvm::Tensor& input, - const std::string& layout) { +tvm::Tensor GlobalAveragePool(const tvm::Tensor& input, + const PoolAttributes& /*pool_attrs*/, + const tvm::Array& /*output_shape*/, + const std::string& /*name*/) { return topi::nn::global_pool(input, - topi::nn::kAvgPool, - layout); + /*pool_type*/ topi::nn::kAvgPool, + /*layout*/ "NCHW"); } } // namespace tvm_codegen diff --git a/onnxruntime/core/codegen/mti/nn/pool_ops.h b/onnxruntime/core/codegen/mti/nn/pool_ops.h index 23fbda913e..d381f9ddff 100644 --- a/onnxruntime/core/codegen/mti/nn/pool_ops.h +++ b/onnxruntime/core/codegen/mti/nn/pool_ops.h @@ -6,27 +6,31 @@ #include namespace onnxruntime { + +// Forward declaration +struct PoolAttributes; + namespace tvm_codegen { tvm::Tensor MaxPool(const tvm::Tensor& input, - const tvm::Array& kernel_size, - const tvm::Array& stride_size, - const tvm::Array& padding_size, - const std::string& layout, - bool count_include_pad); + const PoolAttributes& pool_attrs, + const tvm::Array& output_shape, + const std::string& name = "max_pool"); tvm::Tensor AveragePool(const tvm::Tensor& input, - const tvm::Array& kernel_size, - const tvm::Array& stride_size, - const tvm::Array& padding_size, - const std::string& layout, - bool count_include_pad); + const PoolAttributes& pool_attrs, + const tvm::Array& output_shape, + const std::string& name = "average_pool"); tvm::Tensor GlobalMaxPool(const tvm::Tensor& input, - const std::string& layout); + const PoolAttributes& pool_attrs, + const tvm::Array& output_shape, + const std::string& name = "global_max_pool"); tvm::Tensor GlobalAveragePool(const tvm::Tensor& input, - const std::string& layout); + const PoolAttributes& pool_attrs, + const tvm::Array& output_shape, + const std::string& name = "global_average_pool"); } // namespace tvm_codegen } // namespace onnxruntime diff --git 
a/onnxruntime/core/codegen/passes/op_ir_creator/nn/pool_ops.cc b/onnxruntime/core/codegen/passes/op_ir_creator/nn/pool_ops.cc index 556d175a96..84d3b7c1e0 100644 --- a/onnxruntime/core/codegen/passes/op_ir_creator/nn/pool_ops.cc +++ b/onnxruntime/core/codegen/passes/op_ir_creator/nn/pool_ops.cc @@ -6,86 +6,44 @@ #include "core/codegen/mti/mti_tvm_utils.h" #include "core/codegen/mti/nn/pool_ops.h" #include "core/framework/op_kernel_info.h" +#include "core/providers/cpu/nn/pool_attributes.h" namespace onnxruntime { namespace tvm_codegen { -// helper class for pool_ops with arguments -class FuncWithPoolingArgument { - public: - FuncWithPoolingArgument(const Node& node, const std::string& op_name) { - ProtoHelperNodeContext ctx(node); - OpNodeProtoHelper info(&ctx); - int64_t storage_order{0}; // MaxPool_8 only. 0 is row major, and 1 is column major. Default is 0. - - ORT_ENFORCE(info.GetAttrs("kernel_shape", kernel_shape_).IsOK(), "No kernel shape is set."); - if (kernel_shape_.size() != 2) - ORT_NOT_IMPLEMENTED(kernel_shape_.size(), "d pooling is not implementated"); - if (!info.GetAttrs("pads", pads_).IsOK() || pads_.empty()) { - pads_.resize(kernel_shape_.size() * 2, 0); - } - if (!info.GetAttrs("strides", strides_).IsOK() || strides_.empty()) { - strides_.resize(kernel_shape_.size(), 1); - } - if (op_name == "AveragePool") { - int64_t temp; - ORT_ENFORCE(info.GetAttr("count_include_pad", &temp).IsOK()); - count_include_pad_ = (temp != 0); - } - - if (op_name == "MaxPool") { - // TODO: add version check or not? 
remove version check since only after version 8 would have storage_order, otherwise, it would be zero - storage_order = info.GetAttrOrDefault("storage_order", 0 /*default_value*/); - if (storage_order != 1) { - layout_ = "NCWH"; - } - } - } - - std::vector kernel_shape_; - std::vector pads_; - std::vector strides_; - std::string layout_ = "NCHW"; - bool count_include_pad_ = false; -}; - // A local macro to create Pool Ops // helper macro defines Evaluate of of POOL_OP OpIRCreators -#define POOL_OP(name) \ - Status GENERIC_OP_IR_CREATOR_CLASS(name)::Evaluate( \ - const tvm::Array& inputs, \ - const Node& node, \ - CodeGenContext&, \ - tvm::Array& outputs) { \ - if (outputs.size() > 1) ORT_NOT_IMPLEMENTED("output size = 2 is not implementated"); \ - FuncWithPoolingArgument argment(node, #name); \ - tvm::Tensor Y = name(inputs[0], ToTvmArray(argment.kernel_shape_), ToTvmArray(argment.strides_), ToTvmArray(argment.pads_), argment.layout_, argment.count_include_pad_); \ - outputs.push_back(Y); \ - return Status::OK(); \ - } // namespace tvm_codegen - -POOL_OP(MaxPool) -POOL_OP(AveragePool) - -#undef POOL_OP - -// helper macro defines Evaluate of of GlobalPOOL_OP OpIRCreators -#define POOL_OP(name) \ - Status GENERIC_OP_IR_CREATOR_CLASS(name)::Evaluate( \ - const tvm::Array& inputs, \ - const Node& node, \ - CodeGenContext&, \ - tvm::Array& outputs) { \ - if (inputs[0]->shape.size() != 4) \ - ORT_NOT_IMPLEMENTED(gsl::narrow_cast(inputs[0]->shape.size()) - 2, "d global pooling is not implementated"); \ - tvm::Tensor Y = name(inputs[0], "NCHW"); \ - outputs.push_back(Y); \ - return Status::OK(); \ +#define POOL_OP(name) \ + Status GENERIC_OP_IR_CREATOR_CLASS(name)::Evaluate( \ + const tvm::Array& inputs, \ + const Node& node, \ + CodeGenContext& ctx_codegen, \ + tvm::Array& outputs) { \ + ORT_RETURN_IF_NOT(outputs.size() == 1, "multiple outputs are not supported yet!"); \ + ProtoHelperNodeContext ctx(node); \ + OpNodeProtoHelper info(&ctx); \ + int version = 
ctx_codegen.GetCodeGenHandle()->domain_version_lookup_func(node.Domain()); \ + PoolAttributes pool_attrs(info, #name, version); \ + for (auto n : pool_attrs.dilations) { \ + ORT_RETURN_IF_NOT(n <= 1, "dilations are not supported yet!"); \ + } \ + if (pool_attrs.global_pooling) { \ + if (inputs[0]->shape.size() != 4) { \ + ORT_NOT_IMPLEMENTED(gsl::narrow_cast(inputs[0]->shape.size()) - 2, "d global pooling is not implementated"); \ + } \ + } else { \ + if (pool_attrs.kernel_shape.size() != 2) { \ + ORT_NOT_IMPLEMENTED(pool_attrs.kernel_shape.size(), "d pooling is not implementated"); \ + } \ + } \ + tvm::Array dummy_output_shape; \ + tvm::Tensor Y = name(inputs[0], pool_attrs, dummy_output_shape); \ + outputs.push_back(Y); \ + return Status::OK(); \ } -POOL_OP(GlobalMaxPool) -POOL_OP(GlobalAveragePool) +LIST_POOL_OPS() #undef POOL_OP diff --git a/onnxruntime/core/providers/nuphar/compiler/codegen_manager.cc b/onnxruntime/core/providers/nuphar/compiler/codegen_manager.cc index 981e14d2be..76458213e0 100644 --- a/onnxruntime/core/providers/nuphar/compiler/codegen_manager.cc +++ b/onnxruntime/core/providers/nuphar/compiler/codegen_manager.cc @@ -3,7 +3,6 @@ #include "core/providers/nuphar/compiler/codegen_manager.h" -#include "core/codegen/common/op_macro.h" #include "core/codegen/passes/op_ir_creator/all_ops.h" #include "core/codegen/passes/scheduler/all_schedules.h" #include "core/codegen/passes/weight_layout/transpose_2d.h" @@ -27,6 +26,7 @@ namespace nuphar { #define ADD_OP_ITEM(name) \ op_ir_registry->Register(std::move(std::make_unique())); +#define POOL_OP(OP) ADD_OP_ITEM(OP) #define REDUCE_V_OP(name) ADD_OP_ITEM(name) #define UNARY_OP(name) ADD_OP_ITEM(name) @@ -35,6 +35,7 @@ static void RegisterAllNupharX86OpIRCreators(tvm_codegen::OpIRRegistry* op_ir_re } #undef ADD_OP_ITEM +#undef POOL_OP #undef REDUCE_V_OP #undef UNARY_OP @@ -117,6 +118,7 @@ static void RegisterAllNupharWeightLayouts(tvm_codegen::WeightLayoutRegistry* la #define ADD_OP_ITEM(name) \ 
dispatcher->Register(#name, registry->Get(NUPHAR_TVM_X86_OP_IR_CREATOR_STRING(name))); +#define POOL_OP(OP) ADD_OP_ITEM(OP) #define REDUCE_V_OP(name) ADD_OP_ITEM(name) #define UNARY_OP(name) ADD_OP_ITEM(name) @@ -128,6 +130,7 @@ static void RegisterNupharX86Dispatcher(const std::shared_ptr GetOutputShapeAndPads(const Node& node, + PoolAttributes& pool_attrs, + tvm_codegen::CodeGenContext& ctx_codegen) { + const NodeArg* input = node.InputDefs()[0]; + ORT_ENFORCE(input); + const ONNX_NAMESPACE::TensorShapeProto* shape_proto = input->Shape(); + size_t num_input_dims = shape_proto->dim_size(); + ORT_ENFORCE(num_input_dims >= 2); + + tvm::Array output_shape; + // batch dimenion + output_shape.push_back(ShapeDimToTvmDim(shape_proto->dim(0), ctx_codegen)); + // output channel + output_shape.push_back(ShapeDimToTvmDim(shape_proto->dim(1), ctx_codegen)); + + size_t kernel_sz = pool_attrs.kernel_shape.size(); + if (pool_attrs.global_pooling) { + pool_attrs.pads.assign(kernel_sz, 0); + // skip batch and channel dimensions, so dim starts from 2 + for (size_t dim = 2; dim < num_input_dims; dim++) { + output_shape.push_back(tvm::make_const(tvm::Int(32), 1)); + } + } else { + ORT_ENFORCE(num_input_dims > kernel_sz); + size_t kernel_idx_offset = num_input_dims - kernel_sz; + for (size_t dim = 0; dim < kernel_sz; dim++) { + // TODO: handle symbolic dimensions + ORT_ENFORCE(ShapeHasValue(input, dim + kernel_idx_offset)); + int64_t dim_val = ShapeValue(input, dim + kernel_idx_offset); + int64_t dim_size = 0; + pool_attrs.ComputeSizePadDilations(static_cast(dim_val), + pool_attrs.strides[dim], + pool_attrs.kernel_shape[dim], + &(pool_attrs.pads[dim]), + &(pool_attrs.pads[kernel_sz + dim]), + pool_attrs.dilations[dim], + &dim_size); + output_shape.push_back(tvm::make_const(tvm::Int(32), dim_size)); + } + } + return output_shape; +} + +#define POOL_OP(name) \ + Status NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(name)::Evaluate( \ + const tvm::Array& inputs, \ + const Node& node, \ + 
tvm_codegen::CodeGenContext& ctx_codegen, \ + tvm::Array& outputs) { \ + ORT_RETURN_IF_NOT(node.OutputDefs().size() == 1, " multiple outputs are not supported yet!"); \ + ORT_RETURN_IF_NOT(inputs[0]->dtype == HalideIR::Float(32), " non-float32 not supported yet"); \ + ProtoHelperNodeContext ctx(node); \ + OpNodeProtoHelper info(&ctx); \ + int version = ctx_codegen.GetCodeGenHandle()->domain_version_lookup_func(node.Domain()); \ + PoolAttributes pool_attrs(info, #name, version); \ + for (auto n : pool_attrs.dilations) { \ + ORT_RETURN_IF_NOT(n <= 1, "dilations are not supported yet!"); \ + } \ + tvm::Array output_shape = GetOutputShapeAndPads(node, pool_attrs, ctx_codegen); \ + tvm::Tensor Y = name(inputs[0], pool_attrs, output_shape); \ + outputs.push_back(Y); \ + return Status::OK(); \ + } \ + +LIST_X86_POOL_OPS() + +#undef POOL_OP + +} // namespace nuphar +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/nuphar/kernel.h b/onnxruntime/core/providers/nuphar/kernel.h index 485e4d733b..0a37b5524f 100644 --- a/onnxruntime/core/providers/nuphar/kernel.h +++ b/onnxruntime/core/providers/nuphar/kernel.h @@ -77,7 +77,8 @@ class NupharKernelState { NUPHAR_OP(Add, 7, DataTypeImpl::AllFixedSizeTensorTypes()) \ NUPHAR_OP(ArgMax, 1, DataTypeImpl::AllFixedSizeTensorTypes()) \ NUPHAR_OP(ArgMin, 1, DataTypeImpl::AllFixedSizeTensorTypes()) \ - DISABLE_MACRO(NUPHAR_OP(AveragePool, 7, DataTypeImpl::AllFixedSizeTensorTypes())) \ + NUPHAR_VERSIONED_OP(AveragePool, 7, 9, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ + NUPHAR_OP(AveragePool, 10, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ NUPHAR_OP(Ceil, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ NUPHAR_OP(Clip, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ NUPHAR_OP(Concat, 4, DataTypeImpl::AllFixedSizeTensorTypes()) \ @@ -94,8 +95,8 @@ class NupharKernelState { NUPHAR_OP(Floor, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ NUPHAR_VERSIONED_OP(Gemm, 7, 8, 
DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ NUPHAR_OP(Gemm, 9, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ - DISABLE_MACRO(NUPHAR_OP(GlobalAveragePool, 1, DataTypeImpl::AllFixedSizeTensorTypes())) \ - DISABLE_MACRO(NUPHAR_OP(GlobalMaxPool, 1, DataTypeImpl::AllFixedSizeTensorTypes())) \ + NUPHAR_OP(GlobalAveragePool, 1, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ + NUPHAR_OP(GlobalMaxPool, 1, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ NUPHAR_OP(Greater, 9, DataTypeImpl::AllFixedSizeTensorTypes()) \ NUPHAR_OP(HardSigmoid, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \ NUPHAR_OP(Identity, 1, DataTypeImpl::AllFixedSizeTensorTypes()) \ @@ -107,8 +108,9 @@ class NupharKernelState { NUPHAR_VERSIONED_OP(MatMul, 1, 8, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ NUPHAR_OP(MatMul, 9, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ NUPHAR_OP(Max, 8, DataTypeImpl::AllFixedSizeTensorTypes()) \ - DISABLE_MACRO(NUPHAR_VERSIONED_OP(MaxPool, 1, 7, DataTypeImpl::AllFixedSizeTensorTypes())) \ - DISABLE_MACRO(NUPHAR_OP(MaxPool, 8, DataTypeImpl::AllFixedSizeTensorTypes())) \ + NUPHAR_VERSIONED_OP(MaxPool, 1, 7, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ + NUPHAR_VERSIONED_OP(MaxPool, 8, 9, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ + NUPHAR_OP(MaxPool, 10, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \ NUPHAR_OP(Min, 8, DataTypeImpl::AllFixedSizeTensorTypes()) \ NUPHAR_OP(Mul, 7, DataTypeImpl::AllFixedSizeTensorTypes()) \ NUPHAR_OP(Neg, 6, DataTypeImpl::AllFixedSizeTensorTypes()) \ diff --git a/onnxruntime/core/providers/nuphar/mti_x86/nn/pool_ops.cc b/onnxruntime/core/providers/nuphar/mti_x86/nn/pool_ops.cc new file mode 100644 index 0000000000..724721c91e --- /dev/null +++ b/onnxruntime/core/providers/nuphar/mti_x86/nn/pool_ops.cc @@ -0,0 +1,176 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/providers/nuphar/mti_x86/nn/pool_ops.h" + +#include "core/codegen/mti/mti_tvm_utils.h" +#include "core/mlas/inc/mlas.h" +#include "core/providers/cpu/nn/pool_attributes.h" +#include + +namespace onnxruntime { +namespace nuphar { + +TVM_REGISTER_GLOBAL("tvm.contrib.onnxruntime.pool_f32") + .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* /*ret*/) { + // input + DLTensor* X = args[0]; + DCHECK(tvm::runtime::TypeMatch(X->dtype, kDLFloat, 32)); + // output + DLTensor* Y = args[1]; + DCHECK(tvm::runtime::TypeMatch(Y->dtype, kDLFloat, 32)); + + // enum is not an integral type + int k = args[2]; + MLAS_POOLING_KIND kind = static_cast(k); + + int num_args = args.size(); + DCHECK(num_args > 3); + int arg_idx = 3; + + auto extract_values_fn = [&]() { + std::vector vec; + + DCHECK(arg_idx < num_args); + int64_t num_vec = args[arg_idx++]; + for (int i = 0; i < num_vec; i++, arg_idx++) { + DCHECK(arg_idx < num_args); + int64_t v = args[arg_idx]; + vec.push_back(v); + } + return vec; + }; + + std::vector kernel_shape = extract_values_fn(); + std::vector padding = extract_values_fn(); + std::vector strides = extract_values_fn(); + + MlasPool(kind, + /*num_pooling_dims*/ kernel_shape.size(), + /*input_shape*/ X->shape, + kernel_shape.data(), + padding.data(), + strides.data(), + /*output_shape*/ Y->shape, + reinterpret_cast(static_cast(X->data) + X->byte_offset), + reinterpret_cast(static_cast(Y->data) + Y->byte_offset), + /*thread_pool*/ nullptr); + }); + +TVM_REGISTER_GLOBAL("tvm.contrib.onnxruntime.global_pool_f32") + .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* /*ret*/) { + // input + DLTensor* X = args[0]; + DCHECK(tvm::runtime::TypeMatch(X->dtype, kDLFloat, 32)); + // output + DLTensor* Y = args[1]; + DCHECK(tvm::runtime::TypeMatch(Y->dtype, kDLFloat, 32)); + + // enum is not an integral type + int k = args[2]; + MLAS_POOLING_KIND kind = static_cast(k); + + MlasPool(kind, + /*num_pooling_dims*/ X->ndim - 2, + /*input_shape*/ X->shape, + 
/*kernel_shape*/ nullptr, + /*padding*/ nullptr, + /*strides*/ nullptr, + /*output_shape*/ Y->shape, + reinterpret_cast(static_cast(X->data) + X->byte_offset), + reinterpret_cast(static_cast(Y->data) + Y->byte_offset), + /*thread_pool*/ nullptr); + }); + +static tvm::Tensor MakeGlobalPoolCommon(const tvm::Tensor& X, + const MLAS_POOLING_KIND kind, + const tvm::Array& output_shape, + const std::string& name) { + return topi::detail::make_extern( + /*output_shapes*/ {output_shape}, + /*output_types*/ {X->dtype}, + /*inputs*/ {X}, + [&](tvm::Array ins, tvm::Array outs) { + return topi::detail::call_packed({tvm::Expr("tvm.contrib.onnxruntime.global_pool_f32"), + topi::detail::pack_buffer(ins[0]), + topi::detail::pack_buffer(outs[0]), + static_cast(kind)}); + }, + name, /*tag*/ "", /*attrs*/ {})[0]; +} + +static tvm::Tensor MakePoolCommon(const tvm::Tensor& X, + const PoolAttributes& pool_attrs, + const MLAS_POOLING_KIND kind, + const tvm::Array& output_shape, + const std::string& name) { + size_t num_input_dims = X.ndim(); + ORT_ENFORCE(num_input_dims >= 3, "Input dimension must be >= 3"); + size_t num_pooling_dims = num_input_dims - 2; + ORT_ENFORCE(num_pooling_dims <= 3, "pooling size must be <= 3"); + ORT_ENFORCE(num_pooling_dims == pool_attrs.kernel_shape.size(), + "kernel_shape num_dims is not compatible with X num_dims."); + + tvm::Array pooling_args; + auto add_args_fn = [&](const std::vector& v) { + pooling_args.push_back(tvm::make_const(tvm::Int(64), static_cast(v.size()))); + for (auto n : v) { + pooling_args.push_back(tvm::make_const(tvm::Int(64), n)); + } + }; + add_args_fn(pool_attrs.kernel_shape); + add_args_fn(pool_attrs.pads); + add_args_fn(pool_attrs.strides); + + return topi::detail::make_extern( + /*output_shapes*/ {output_shape}, + /*output_types*/ {X->dtype}, + /*inputs*/ {X}, + [&](tvm::Array ins, tvm::Array outs) { + tvm::Array args = {tvm::Expr("tvm.contrib.onnxruntime.pool_f32"), + topi::detail::pack_buffer(ins[0]), + 
topi::detail::pack_buffer(outs[0]), + static_cast(kind)}; + // kernel_shape, padds and strides are directly passed into the external function + for (size_t i = 0; i < pooling_args.size(); i++) { + args.push_back(pooling_args[i]); + } + return topi::detail::call_packed(args); + }, + name, /*tag*/ "", /*attrs*/ {})[0]; +} + +tvm::Tensor AveragePool(const tvm::Tensor& X, + const PoolAttributes& pool_attrs, + const tvm::Array& output_shape, + const std::string& name) { + MLAS_POOLING_KIND kind = pool_attrs.count_include_pad ? MlasAveragePoolingIncludePad + : MlasAveragePoolingExcludePad; + return MakePoolCommon(X, pool_attrs, kind, output_shape, name); +} + +tvm::Tensor GlobalAveragePool(const tvm::Tensor& X, + const PoolAttributes& pool_attrs, + const tvm::Array& output_shape, + const std::string& name) { + MLAS_POOLING_KIND kind = pool_attrs.count_include_pad ? MlasAveragePoolingIncludePad + : MlasAveragePoolingExcludePad; + return MakeGlobalPoolCommon(X, kind, output_shape, name); +} + +tvm::Tensor MaxPool(const tvm::Tensor& X, + const PoolAttributes& pool_attrs, + const tvm::Array& output_shape, + const std::string& name) { + return MakePoolCommon(X, pool_attrs, MlasMaximumPooling, output_shape, name); +} + +tvm::Tensor GlobalMaxPool(const tvm::Tensor& X, + const PoolAttributes& /*pool_attrs*/, + const tvm::Array& output_shape, + const std::string& name) { + return MakeGlobalPoolCommon(X, MlasMaximumPooling, output_shape, name); +} + +} // namespace nuphar +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/nuphar/mti_x86/nn/pool_ops.h b/onnxruntime/core/providers/nuphar/mti_x86/nn/pool_ops.h new file mode 100644 index 0000000000..614c3d7deb --- /dev/null +++ b/onnxruntime/core/providers/nuphar/mti_x86/nn/pool_ops.h @@ -0,0 +1,37 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once + +#include +#include + +namespace onnxruntime { + +// Forward declaration +struct PoolAttributes; + +namespace nuphar { + +tvm::Tensor AveragePool(const tvm::Tensor& X, + const PoolAttributes& pool_attrs, + const tvm::Array& output_shape, + const std::string& name = "average_pool"); + +tvm::Tensor GlobalAveragePool(const tvm::Tensor& X, + const PoolAttributes& pool_attrs, + const tvm::Array& output_shape, + const std::string& name = "global_average_pool"); + +tvm::Tensor MaxPool(const tvm::Tensor& X, + const PoolAttributes& pool_attrs, + const tvm::Array& output_shape, + const std::string& name = "max_pool"); + +tvm::Tensor GlobalMaxPool(const tvm::Tensor& X, + const PoolAttributes& pool_attrs, + const tvm::Array& output_shape, + const std::string& name = "global_max_pool"); + +} // namespace nuphar +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/nuphar/nuphar_execution_provider.cc b/onnxruntime/core/providers/nuphar/nuphar_execution_provider.cc index 7e2cd8502f..0b37c9a3ea 100644 --- a/onnxruntime/core/providers/nuphar/nuphar_execution_provider.cc +++ b/onnxruntime/core/providers/nuphar/nuphar_execution_provider.cc @@ -193,6 +193,23 @@ NupharExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_vie if (node.OpType() == "Tile" && !graph_viewer.IsConstantInitializer(inputs[1]->Name(), true)) return false; // do not support tile that has dynamic repeats + if (node.OpType() == "MaxPool") { + // TODO: enable support for Indices + if (node.OutputDefs().size() > 1) { + return false; + } + // TODO: enable support for non-default dilations + const onnxruntime::NodeAttributes& attrs = node.GetAttributes(); + auto it = attrs.find("dilations"); + if (it != attrs.end()) { + for (int i = 0; i < it->second.ints_size(); i++) { + if (it->second.ints(i) > 1) { + return false; + } + } + } + } + if (node.OpType() == "Slice") { auto num_inputs = inputs.size(); ORT_ENFORCE(num_inputs > 0);