use MLAS for nuphar's pool ops (#1937)

* call MLAS's pooling function as an external call for Nuphar

  Note that at the moment Nuphar provider doesn't handle the cases below:

  - symbolic height/width dimensions
  - Indices output of MaxPool
  - non-default dilations

* unify the pool interface for mti and mti_x86
This commit is contained in:
Yang Chen 2019-09-26 16:29:30 -07:00 committed by GitHub
parent c3ffd1f47d
commit 650fb8754b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 417 additions and 125 deletions

View file

@ -3,6 +3,9 @@
#include "core/codegen/mti/nn/pool_ops.h"
#include "core/codegen/mti/mti_tvm_utils.h"
#include "core/mlas/inc/mlas.h"
#include "core/providers/cpu/nn/pool_attributes.h"
#include <topi/nn/pooling.h>
namespace onnxruntime {
@ -10,48 +13,50 @@ namespace tvm_codegen {
// TODO: topi only support 2d-pool, MaxPool1d and MaxPool3d will need to be added if necessary.
// only support version < 8 for topi doesn't come with implementation to output index tensor
tvm::Tensor MaxPool(
const tvm::Tensor& input,
const tvm::Array<tvm::Expr>& kernel_size,
const tvm::Array<tvm::Expr>& stride_size,
const tvm::Array<tvm::Expr>& padding_size,
const std::string& layout,
bool count_include_pad) {
return topi::nn::pool(input, kernel_size, stride_size, padding_size,
topi::nn::kMaxPool,
false,
layout,
count_include_pad);
// Max pooling built on topi::nn::pool.
// kernel_shape, strides and pads are taken from pool_attrs and converted with
// ToTvmArray. The output_shape and name parameters are accepted but not used
// by this implementation. ceil_mode is always passed as false.
// NOTE(review): storage_order == 0 selects the "NCWH" layout string and any
// other value selects "NCHW", while AveragePool in this file always passes
// "NCHW" - confirm this mapping is intended.
tvm::Tensor MaxPool(const tvm::Tensor& input,
const PoolAttributes& pool_attrs,
const tvm::Array<tvm::Expr>& /*output_shape*/,
const std::string& /*name*/) {
return topi::nn::pool(input,
ToTvmArray(pool_attrs.kernel_shape),
ToTvmArray(pool_attrs.strides),
ToTvmArray(pool_attrs.pads),
/*pool_type*/ topi::nn::kMaxPool,
/*ceil_mode*/ false,
/*layout*/ pool_attrs.storage_order == 0 ? "NCWH" : "NCHW",
pool_attrs.count_include_pad);
}
tvm::Tensor AveragePool(
const tvm::Tensor& input,
const tvm::Array<tvm::Expr>& kernel_size,
const tvm::Array<tvm::Expr>& stride_size,
const tvm::Array<tvm::Expr>& padding_size,
const std::string& layout,
bool count_include_pad) {
return topi::nn::pool(input, kernel_size, stride_size, padding_size,
topi::nn::kAvgPool,
false,
layout,
count_include_pad);
// Average pooling built on topi::nn::pool with a fixed "NCHW" layout.
// kernel_shape, strides and pads are taken from pool_attrs and converted with
// ToTvmArray; pool_attrs.count_include_pad is forwarded unchanged.
// The output_shape and name parameters are accepted but not used by this
// implementation. ceil_mode is always passed as false.
tvm::Tensor AveragePool(const tvm::Tensor& input,
const PoolAttributes& pool_attrs,
const tvm::Array<tvm::Expr>& /*output_shape*/,
const std::string& /*name*/) {
return topi::nn::pool(input,
ToTvmArray(pool_attrs.kernel_shape),
ToTvmArray(pool_attrs.strides),
ToTvmArray(pool_attrs.pads),
/*pool_type*/ topi::nn::kAvgPool,
/*ceil_mode*/ false,
/*layout*/ "NCHW",
pool_attrs.count_include_pad);
}
tvm::Tensor GlobalMaxPool(
const tvm::Tensor& input,
const std::string& layout) {
tvm::Tensor GlobalMaxPool(const tvm::Tensor& input,
const PoolAttributes& /*pool_attrs*/,
const tvm::Array<tvm::Expr>& /*output_shape*/,
const std::string& /*name*/) {
return topi::nn::global_pool(input,
topi::nn::kMaxPool,
layout);
/*pool_type*/ topi::nn::kMaxPool,
/*layout*/ "NCHW");
}
tvm::Tensor GlobalAveragePool(
const tvm::Tensor& input,
const std::string& layout) {
tvm::Tensor GlobalAveragePool(const tvm::Tensor& input,
const PoolAttributes& /*pool_attrs*/,
const tvm::Array<tvm::Expr>& /*output_shape*/,
const std::string& /*name*/) {
return topi::nn::global_pool(input,
topi::nn::kAvgPool,
layout);
/*pool_type*/ topi::nn::kAvgPool,
/*layout*/ "NCHW");
}
} // namespace tvm_codegen

View file

@ -6,27 +6,31 @@
#include <tvm/tvm.h>
namespace onnxruntime {
// Forward declaration
struct PoolAttributes;
namespace tvm_codegen {
tvm::Tensor MaxPool(const tvm::Tensor& input,
const tvm::Array<tvm::Expr>& kernel_size,
const tvm::Array<tvm::Expr>& stride_size,
const tvm::Array<tvm::Expr>& padding_size,
const std::string& layout,
bool count_include_pad);
const PoolAttributes& pool_attrs,
const tvm::Array<tvm::Expr>& output_shape,
const std::string& name = "max_pool");
tvm::Tensor AveragePool(const tvm::Tensor& input,
const tvm::Array<tvm::Expr>& kernel_size,
const tvm::Array<tvm::Expr>& stride_size,
const tvm::Array<tvm::Expr>& padding_size,
const std::string& layout,
bool count_include_pad);
const PoolAttributes& pool_attrs,
const tvm::Array<tvm::Expr>& output_shape,
const std::string& name = "average_pool");
tvm::Tensor GlobalMaxPool(const tvm::Tensor& input,
const std::string& layout);
const PoolAttributes& pool_attrs,
const tvm::Array<tvm::Expr>& output_shape,
const std::string& name = "global_max_pool");
tvm::Tensor GlobalAveragePool(const tvm::Tensor& input,
const std::string& layout);
const PoolAttributes& pool_attrs,
const tvm::Array<tvm::Expr>& output_shape,
const std::string& name = "global_average_pool");
} // namespace tvm_codegen
} // namespace onnxruntime

View file

@ -6,86 +6,44 @@
#include "core/codegen/mti/mti_tvm_utils.h"
#include "core/codegen/mti/nn/pool_ops.h"
#include "core/framework/op_kernel_info.h"
#include "core/providers/cpu/nn/pool_attributes.h"
namespace onnxruntime {
namespace tvm_codegen {
// helper class for pool_ops with arguments
class FuncWithPoolingArgument {
public:
FuncWithPoolingArgument(const Node& node, const std::string& op_name) {
ProtoHelperNodeContext ctx(node);
OpNodeProtoHelper<ProtoHelperNodeContext> info(&ctx);
int64_t storage_order{0}; // MaxPool_8 only. 0 is row major, and 1 is column major. Default is 0.
ORT_ENFORCE(info.GetAttrs<int64_t>("kernel_shape", kernel_shape_).IsOK(), "No kernel shape is set.");
if (kernel_shape_.size() != 2)
ORT_NOT_IMPLEMENTED(kernel_shape_.size(), "d pooling is not implementated");
if (!info.GetAttrs<int64_t>("pads", pads_).IsOK() || pads_.empty()) {
pads_.resize(kernel_shape_.size() * 2, 0);
}
if (!info.GetAttrs<int64_t>("strides", strides_).IsOK() || strides_.empty()) {
strides_.resize(kernel_shape_.size(), 1);
}
if (op_name == "AveragePool") {
int64_t temp;
ORT_ENFORCE(info.GetAttr<int64_t>("count_include_pad", &temp).IsOK());
count_include_pad_ = (temp != 0);
}
if (op_name == "MaxPool") {
// TODO: add version check or not? remove version check since only after version 8 would have storage_order, otherwise, it would be zero
storage_order = info.GetAttrOrDefault<int64_t>("storage_order", 0 /*default_value*/);
if (storage_order != 1) {
layout_ = "NCWH";
}
}
}
std::vector<int64_t> kernel_shape_;
std::vector<int64_t> pads_;
std::vector<int64_t> strides_;
std::string layout_ = "NCHW";
bool count_include_pad_ = false;
};
// A local macro to create Pool Ops
// helper macro defines Evaluate of of POOL_OP OpIRCreators
#define POOL_OP(name) \
Status GENERIC_OP_IR_CREATOR_CLASS(name)::Evaluate( \
const tvm::Array<tvm::Tensor>& inputs, \
const Node& node, \
CodeGenContext&, \
tvm::Array<tvm::Tensor>& outputs) { \
if (outputs.size() > 1) ORT_NOT_IMPLEMENTED("output size = 2 is not implementated"); \
FuncWithPoolingArgument argment(node, #name); \
tvm::Tensor Y = name(inputs[0], ToTvmArray(argment.kernel_shape_), ToTvmArray(argment.strides_), ToTvmArray(argment.pads_), argment.layout_, argment.count_include_pad_); \
outputs.push_back(Y); \
return Status::OK(); \
} // namespace tvm_codegen
POOL_OP(MaxPool)
POOL_OP(AveragePool)
#undef POOL_OP
// helper macro defines Evaluate of of GlobalPOOL_OP OpIRCreators
#define POOL_OP(name) \
Status GENERIC_OP_IR_CREATOR_CLASS(name)::Evaluate( \
const tvm::Array<tvm::Tensor>& inputs, \
const Node& node, \
CodeGenContext&, \
tvm::Array<tvm::Tensor>& outputs) { \
if (inputs[0]->shape.size() != 4) \
ORT_NOT_IMPLEMENTED(gsl::narrow_cast<int64_t>(inputs[0]->shape.size()) - 2, "d global pooling is not implementated"); \
tvm::Tensor Y = name(inputs[0], "NCHW"); \
outputs.push_back(Y); \
return Status::OK(); \
// Defines Evaluate() for a generic pooling OpIRCreator.
//
// Steps performed by the generated function:
//   1. Reject nodes that declare more than one output. The check is made on
//      node.OutputDefs() because `outputs` is only appended to at the end of
//      this function; testing outputs.size() here would not describe the node.
//   2. Build PoolAttributes from the node attributes and the opset version
//      looked up for the node's domain.
//   3. Reject dilations greater than 1.
//   4. Only 2-d pooling is implemented: global pooling requires a 4-d input,
//      non-global pooling requires a 2-element kernel_shape.
//   5. Call the mti pooling function with an empty output shape and append
//      the resulting tensor to `outputs`.
#define POOL_OP(name)                                                                                                       \
  Status GENERIC_OP_IR_CREATOR_CLASS(name)::Evaluate(                                                                       \
      const tvm::Array<tvm::Tensor>& inputs,                                                                                \
      const Node& node,                                                                                                     \
      CodeGenContext& ctx_codegen,                                                                                          \
      tvm::Array<tvm::Tensor>& outputs) {                                                                                   \
    ORT_RETURN_IF_NOT(node.OutputDefs().size() == 1, "multiple outputs are not supported yet!");                            \
    ProtoHelperNodeContext ctx(node);                                                                                       \
    OpNodeProtoHelper<ProtoHelperNodeContext> info(&ctx);                                                                   \
    int version = ctx_codegen.GetCodeGenHandle()->domain_version_lookup_func(node.Domain());                                \
    PoolAttributes pool_attrs(info, #name, version);                                                                        \
    for (auto n : pool_attrs.dilations) {                                                                                   \
      ORT_RETURN_IF_NOT(n <= 1, "dilations are not supported yet!");                                                        \
    }                                                                                                                       \
    if (pool_attrs.global_pooling) {                                                                                        \
      if (inputs[0]->shape.size() != 4) {                                                                                   \
        ORT_NOT_IMPLEMENTED(gsl::narrow_cast<int64_t>(inputs[0]->shape.size()) - 2, "d global pooling is not implementated"); \
      }                                                                                                                     \
    } else {                                                                                                                \
      if (pool_attrs.kernel_shape.size() != 2) {                                                                            \
        ORT_NOT_IMPLEMENTED(pool_attrs.kernel_shape.size(), "d pooling is not implementated");                              \
      }                                                                                                                     \
    }                                                                                                                       \
    tvm::Array<tvm::Expr> dummy_output_shape;                                                                               \
    tvm::Tensor Y = name(inputs[0], pool_attrs, dummy_output_shape);                                                        \
    outputs.push_back(Y);                                                                                                   \
    return Status::OK();                                                                                                    \
  }
POOL_OP(GlobalMaxPool)
POOL_OP(GlobalAveragePool)
LIST_POOL_OPS()
#undef POOL_OP

View file

@ -3,7 +3,6 @@
#include "core/providers/nuphar/compiler/codegen_manager.h"
#include "core/codegen/common/op_macro.h"
#include "core/codegen/passes/op_ir_creator/all_ops.h"
#include "core/codegen/passes/scheduler/all_schedules.h"
#include "core/codegen/passes/weight_layout/transpose_2d.h"
@ -27,6 +26,7 @@ namespace nuphar {
#define ADD_OP_ITEM(name) \
op_ir_registry->Register(std::move(std::make_unique<NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(name)>()));
#define POOL_OP(OP) ADD_OP_ITEM(OP)
#define REDUCE_V_OP(name) ADD_OP_ITEM(name)
#define UNARY_OP(name) ADD_OP_ITEM(name)
@ -35,6 +35,7 @@ static void RegisterAllNupharX86OpIRCreators(tvm_codegen::OpIRRegistry* op_ir_re
}
#undef ADD_OP_ITEM
#undef POOL_OP
#undef REDUCE_V_OP
#undef UNARY_OP
@ -117,6 +118,7 @@ static void RegisterAllNupharWeightLayouts(tvm_codegen::WeightLayoutRegistry* la
#define ADD_OP_ITEM(name) \
dispatcher->Register(#name, registry->Get(NUPHAR_TVM_X86_OP_IR_CREATOR_STRING(name)));
#define POOL_OP(OP) ADD_OP_ITEM(OP)
#define REDUCE_V_OP(name) ADD_OP_ITEM(name)
#define UNARY_OP(name) ADD_OP_ITEM(name)
@ -128,6 +130,7 @@ static void RegisterNupharX86Dispatcher(const std::shared_ptr<tvm_codegen::TVMIR
}
#undef ADD_OP_ITEM
#undef POOL_OP
#undef REDUCE_V_OP
#undef UNARY_OP
// END: Nuphar TVM X86 IR creator classes

View file

@ -3,7 +3,6 @@
#include "core/providers/nuphar/compiler/nuphar_op_ir_builder.h"
#include "core/codegen/common/op_macro.h"
#include "core/codegen/mti/mti_tvm_utils.h"
#include "core/codegen/passes/op_ir_creator/all_ops.h"
#include "core/codegen/passes/op_ir_creator/tvm_ir_builder.h"

View file

@ -21,6 +21,12 @@ namespace nuphar {
#define NUPHAR_TVM_X86_OP_IR_CREATOR_STRING(OP) \
STRINGIZE(NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(OP))
#define LIST_X86_POOL_OPS() \
POOL_OP(MaxPool) \
POOL_OP(AveragePool) \
POOL_OP(GlobalMaxPool) \
POOL_OP(GlobalAveragePool)
#define LIST_X86_UNARY_OPS() \
UNARY_OP(Erf) \
UNARY_OP(Exp) \
@ -39,6 +45,7 @@ namespace nuphar {
#define LIST_ALL_X86_OPS() \
LIST_REDUCE_V_OPS() \
LIST_X86_POOL_OPS() \
LIST_X86_UNARY_OPS() \
ADD_OP_ITEM(Gemm) \
ADD_OP_ITEM(LogSoftmax) \
@ -51,6 +58,7 @@ namespace nuphar {
// Define all OPs for NupharTVMX86
#define ADD_OP_ITEM(OP) DECLARE_NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(OP)
#define POOL_OP(OP) ADD_OP_ITEM(OP)
#define REDUCE_V_OP(OP) ADD_OP_ITEM(OP)
#define UNARY_OP(OP) ADD_OP_ITEM(OP)
@ -58,6 +66,7 @@ LIST_ALL_X86_OPS()
#undef ADD_OP_ITEM
#undef REDUCE_V_OP
#undef POOL_OP
#undef UNARY_OP
} // namespace nuphar

View file

@ -3,7 +3,6 @@
#include "core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h"
#include "core/codegen/common/op_macro.h"
#include "core/framework/op_kernel_info.h"
#include "core/providers/nuphar/mti_x86/math/unary_ops.h"

View file

@ -0,0 +1,83 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h"
#include "core/codegen/passes/utils/ort_tvm_utils.h"
#include "core/framework/op_kernel_info.h"
#include "core/providers/cpu/nn/pool_attributes.h"
#include "core/providers/nuphar/mti_x86/nn/pool_ops.h"
namespace onnxruntime {
namespace nuphar {
// Computes the TVM output shape of a pooling node and updates pool_attrs.pads.
//
// The batch and channel dimensions (dims 0 and 1 of the node's first input)
// are converted with ShapeDimToTvmDim and copied to the output shape.
//  - Global pooling: pads are reset to kernel_shape.size() zeros and every
//    remaining output dimension is the constant 1.
//  - Otherwise: each pooled dimension must have a concrete value; its output
//    size is obtained from PoolAttributes::ComputeSizePadDilations, which is
//    also handed pointers to the head and tail pad entries of that dimension.
//
// Enforces that the first input exists, carries shape information, has at
// least two dimensions, and (non-global case) has more dimensions than the
// kernel.
static tvm::Array<tvm::Expr> GetOutputShapeAndPads(const Node& node,
                                                   PoolAttributes& pool_attrs,
                                                   tvm_codegen::CodeGenContext& ctx_codegen) {
  const NodeArg* input = node.InputDefs()[0];
  ORT_ENFORCE(input);
  const ONNX_NAMESPACE::TensorShapeProto* shape_proto = input->Shape();
  // Shape() hands back a pointer; fail with a clear message instead of
  // dereferencing it when no shape is attached to the input.
  ORT_ENFORCE(shape_proto != nullptr, "input shape is not available");
  size_t num_input_dims = shape_proto->dim_size();
  ORT_ENFORCE(num_input_dims >= 2);

  tvm::Array<tvm::Expr> output_shape;
  // batch dimension
  output_shape.push_back(ShapeDimToTvmDim(shape_proto->dim(0), ctx_codegen));
  // output channel
  output_shape.push_back(ShapeDimToTvmDim(shape_proto->dim(1), ctx_codegen));

  size_t kernel_sz = pool_attrs.kernel_shape.size();
  if (pool_attrs.global_pooling) {
    pool_attrs.pads.assign(kernel_sz, 0);
    // skip batch and channel dimensions, so dim starts from 2
    for (size_t dim = 2; dim < num_input_dims; dim++) {
      output_shape.push_back(tvm::make_const(tvm::Int(32), 1));
    }
  } else {
    ORT_ENFORCE(num_input_dims > kernel_sz);
    // pooled dimensions are the trailing kernel_sz dimensions of the input
    size_t kernel_idx_offset = num_input_dims - kernel_sz;
    for (size_t dim = 0; dim < kernel_sz; dim++) {
      // TODO: handle symbolic dimensions
      ORT_ENFORCE(ShapeHasValue(input, dim + kernel_idx_offset));
      int64_t dim_val = ShapeValue(input, dim + kernel_idx_offset);
      int64_t dim_size = 0;
      pool_attrs.ComputeSizePadDilations(static_cast<int>(dim_val),
                                         pool_attrs.strides[dim],
                                         pool_attrs.kernel_shape[dim],
                                         &(pool_attrs.pads[dim]),
                                         &(pool_attrs.pads[kernel_sz + dim]),
                                         pool_attrs.dilations[dim],
                                         &dim_size);
      output_shape.push_back(tvm::make_const(tvm::Int(32), dim_size));
    }
  }
  return output_shape;
}
#define POOL_OP(name) \
Status NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(name)::Evaluate( \
const tvm::Array<tvm::Tensor>& inputs, \
const Node& node, \
tvm_codegen::CodeGenContext& ctx_codegen, \
tvm::Array<tvm::Tensor>& outputs) { \
ORT_RETURN_IF_NOT(node.OutputDefs().size() == 1, " multiple outputs are not supported yet!"); \
ORT_RETURN_IF_NOT(inputs[0]->dtype == HalideIR::Float(32), " non-float32 not supported yet"); \
ProtoHelperNodeContext ctx(node); \
OpNodeProtoHelper<ProtoHelperNodeContext> info(&ctx); \
int version = ctx_codegen.GetCodeGenHandle()->domain_version_lookup_func(node.Domain()); \
PoolAttributes pool_attrs(info, #name, version); \
for (auto n : pool_attrs.dilations) { \
ORT_RETURN_IF_NOT(n <= 1, "dilations are not supported yet!"); \
} \
tvm::Array<tvm::Expr> output_shape = GetOutputShapeAndPads(node, pool_attrs, ctx_codegen); \
tvm::Tensor Y = name(inputs[0], pool_attrs, output_shape); \
outputs.push_back(Y); \
return Status::OK(); \
} \
LIST_X86_POOL_OPS()
#undef POOL_OP
} // namespace nuphar
} // namespace onnxruntime

View file

@ -77,7 +77,8 @@ class NupharKernelState {
NUPHAR_OP(Add, 7, DataTypeImpl::AllFixedSizeTensorTypes()) \
NUPHAR_OP(ArgMax, 1, DataTypeImpl::AllFixedSizeTensorTypes()) \
NUPHAR_OP(ArgMin, 1, DataTypeImpl::AllFixedSizeTensorTypes()) \
DISABLE_MACRO(NUPHAR_OP(AveragePool, 7, DataTypeImpl::AllFixedSizeTensorTypes())) \
NUPHAR_VERSIONED_OP(AveragePool, 7, 9, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \
NUPHAR_OP(AveragePool, 10, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \
NUPHAR_OP(Ceil, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \
NUPHAR_OP(Clip, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \
NUPHAR_OP(Concat, 4, DataTypeImpl::AllFixedSizeTensorTypes()) \
@ -94,8 +95,8 @@ class NupharKernelState {
NUPHAR_OP(Floor, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \
NUPHAR_VERSIONED_OP(Gemm, 7, 8, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \
NUPHAR_OP(Gemm, 9, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \
DISABLE_MACRO(NUPHAR_OP(GlobalAveragePool, 1, DataTypeImpl::AllFixedSizeTensorTypes())) \
DISABLE_MACRO(NUPHAR_OP(GlobalMaxPool, 1, DataTypeImpl::AllFixedSizeTensorTypes())) \
NUPHAR_OP(GlobalAveragePool, 1, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \
NUPHAR_OP(GlobalMaxPool, 1, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \
NUPHAR_OP(Greater, 9, DataTypeImpl::AllFixedSizeTensorTypes()) \
NUPHAR_OP(HardSigmoid, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \
NUPHAR_OP(Identity, 1, DataTypeImpl::AllFixedSizeTensorTypes()) \
@ -107,8 +108,9 @@ class NupharKernelState {
NUPHAR_VERSIONED_OP(MatMul, 1, 8, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \
NUPHAR_OP(MatMul, 9, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \
NUPHAR_OP(Max, 8, DataTypeImpl::AllFixedSizeTensorTypes()) \
DISABLE_MACRO(NUPHAR_VERSIONED_OP(MaxPool, 1, 7, DataTypeImpl::AllFixedSizeTensorTypes())) \
DISABLE_MACRO(NUPHAR_OP(MaxPool, 8, DataTypeImpl::AllFixedSizeTensorTypes())) \
NUPHAR_VERSIONED_OP(MaxPool, 1, 7, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \
NUPHAR_VERSIONED_OP(MaxPool, 8, 9, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \
NUPHAR_OP(MaxPool, 10, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \
NUPHAR_OP(Min, 8, DataTypeImpl::AllFixedSizeTensorTypes()) \
NUPHAR_OP(Mul, 7, DataTypeImpl::AllFixedSizeTensorTypes()) \
NUPHAR_OP(Neg, 6, DataTypeImpl::AllFixedSizeTensorTypes()) \

View file

@ -0,0 +1,176 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/nuphar/mti_x86/nn/pool_ops.h"
#include "core/codegen/mti/mti_tvm_utils.h"
#include "core/mlas/inc/mlas.h"
#include "core/providers/cpu/nn/pool_attributes.h"
#include <topi/detail/extern.h>
namespace onnxruntime {
namespace nuphar {
// Packed function "tvm.contrib.onnxruntime.pool_f32": forwards a pooling
// request to MlasPool.
//
// Argument layout:
//   args[0]   input tensor X (DLTensor*)
//   args[1]   output tensor Y (DLTensor*)
//   args[2]   pooling kind, an MLAS_POOLING_KIND transported as int
//   args[3..] three length-prefixed integer lists, in this order:
//             kernel_shape, padding, strides
// Tensor types and argument bounds are checked with DCHECK only.
TVM_REGISTER_GLOBAL("tvm.contrib.onnxruntime.pool_f32")
    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* /*ret*/) {
      DLTensor* X = args[0];
      DCHECK(tvm::runtime::TypeMatch(X->dtype, kDLFloat, 32));

      DLTensor* Y = args[1];
      DCHECK(tvm::runtime::TypeMatch(Y->dtype, kDLFloat, 32));

      // the kind travels as a plain int and is converted back to the enum here
      int kind_as_int = args[2];
      MLAS_POOLING_KIND kind = static_cast<MLAS_POOLING_KIND>(kind_as_int);

      int total_args = args.size();
      DCHECK(total_args > 3);
      int cursor = 3;

      // Reads one length-prefixed list starting at `cursor` and leaves
      // `cursor` on the first argument after that list.
      auto read_int_list = [&]() {
        std::vector<int64_t> values;
        DCHECK(cursor < total_args);
        int64_t count = args[cursor++];
        for (int i = 0; i < count; i++) {
          DCHECK(cursor < total_args);
          int64_t value = args[cursor];
          values.push_back(value);
          cursor++;
        }
        return values;
      };

      // the read order must match the order used when the call was packed
      std::vector<int64_t> kernel_shape = read_int_list();
      std::vector<int64_t> padding = read_int_list();
      std::vector<int64_t> strides = read_int_list();

      // data pointers honour the byte offset recorded in each DLTensor
      float* x_data = reinterpret_cast<float*>(static_cast<char*>(X->data) + X->byte_offset);
      float* y_data = reinterpret_cast<float*>(static_cast<char*>(Y->data) + Y->byte_offset);

      MlasPool(kind,
               /*num_pooling_dims*/ kernel_shape.size(),
               /*input_shape*/ X->shape,
               kernel_shape.data(),
               padding.data(),
               strides.data(),
               /*output_shape*/ Y->shape,
               x_data,
               y_data,
               /*thread_pool*/ nullptr);
    });
// Packed function "tvm.contrib.onnxruntime.global_pool_f32": forwards a
// global pooling request to MlasPool.
//
// Argument layout:
//   args[0]  input tensor X (DLTensor*)
//   args[1]  output tensor Y (DLTensor*)
//   args[2]  pooling kind, an MLAS_POOLING_KIND transported as int
// kernel_shape, padding and strides are passed to MlasPool as nullptr, and
// the number of pooling dimensions is X->ndim - 2.
TVM_REGISTER_GLOBAL("tvm.contrib.onnxruntime.global_pool_f32")
    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* /*ret*/) {
      DLTensor* X = args[0];
      DCHECK(tvm::runtime::TypeMatch(X->dtype, kDLFloat, 32));

      DLTensor* Y = args[1];
      DCHECK(tvm::runtime::TypeMatch(Y->dtype, kDLFloat, 32));

      // the kind travels as a plain int and is converted back to the enum here
      int kind_as_int = args[2];
      MLAS_POOLING_KIND kind = static_cast<MLAS_POOLING_KIND>(kind_as_int);

      // data pointers honour the byte offset recorded in each DLTensor
      float* x_data = reinterpret_cast<float*>(static_cast<char*>(X->data) + X->byte_offset);
      float* y_data = reinterpret_cast<float*>(static_cast<char*>(Y->data) + Y->byte_offset);

      MlasPool(kind,
               /*num_pooling_dims*/ X->ndim - 2,
               /*input_shape*/ X->shape,
               /*kernel_shape*/ nullptr,
               /*padding*/ nullptr,
               /*strides*/ nullptr,
               /*output_shape*/ Y->shape,
               x_data,
               y_data,
               /*thread_pool*/ nullptr);
    });
// Creates an extern TVM tensor that calls the
// "tvm.contrib.onnxruntime.global_pool_f32" packed function.
//   X            input tensor; the output uses the same dtype
//   kind         MLAS pooling kind, passed to the packed function as int
//   output_shape shape of the produced tensor
//   name         name given to the extern op
static tvm::Tensor MakeGlobalPoolCommon(const tvm::Tensor& X,
                                        const MLAS_POOLING_KIND kind,
                                        const tvm::Array<tvm::Expr>& output_shape,
                                        const std::string& name) {
  // Emits the packed call: function name, input buffer, output buffer, kind.
  auto emit_call = [&](tvm::Array<tvm::Buffer> ins, tvm::Array<tvm::Buffer> outs) {
    tvm::Array<tvm::Expr> call_args = {tvm::Expr("tvm.contrib.onnxruntime.global_pool_f32"),
                                       topi::detail::pack_buffer(ins[0]),
                                       topi::detail::pack_buffer(outs[0]),
                                       static_cast<int>(kind)};
    return topi::detail::call_packed(call_args);
  };

  // a single output is declared, so the tensor of interest is element 0
  return topi::detail::make_extern(
      /*output_shapes*/ {output_shape},
      /*output_types*/ {X->dtype},
      /*inputs*/ {X},
      emit_call,
      name, /*tag*/ "", /*attrs*/ {})[0];
}
// Creates an extern TVM tensor that calls the
// "tvm.contrib.onnxruntime.pool_f32" packed function.
//   X            input tensor with 3 to 5 dimensions; the output uses the
//                same dtype
//   pool_attrs   source of kernel_shape, pads and strides
//   kind         MLAS pooling kind, passed to the packed function as int
//   output_shape shape of the produced tensor
//   name         name given to the extern op
// kernel_shape, pads and strides are appended to the call as length-prefixed
// lists of Int(64) constants, in that order.
static tvm::Tensor MakePoolCommon(const tvm::Tensor& X,
                                  const PoolAttributes& pool_attrs,
                                  const MLAS_POOLING_KIND kind,
                                  const tvm::Array<tvm::Expr>& output_shape,
                                  const std::string& name) {
  size_t input_rank = X.ndim();
  ORT_ENFORCE(input_rank >= 3, "Input dimension must be >= 3");

  // everything after the two leading dimensions is pooled
  size_t pooling_rank = input_rank - 2;
  ORT_ENFORCE(pooling_rank <= 3, "pooling size must be <= 3");
  ORT_ENFORCE(pooling_rank == pool_attrs.kernel_shape.size(),
              "kernel_shape num_dims is not compatible with X num_dims.");

  // Flattened trailing arguments: [count, v0, v1, ...] for each list.
  tvm::Array<tvm::Expr> trailing_args;
  auto append_list = [&](const std::vector<int64_t>& values) {
    const int64_t count = static_cast<int64_t>(values.size());
    trailing_args.push_back(tvm::make_const(tvm::Int(64), count));
    for (auto value : values) {
      trailing_args.push_back(tvm::make_const(tvm::Int(64), value));
    }
  };
  append_list(pool_attrs.kernel_shape);
  append_list(pool_attrs.pads);
  append_list(pool_attrs.strides);

  // Emits the packed call: function name, input buffer, output buffer, kind,
  // followed by the flattened kernel_shape / pads / strides lists.
  auto emit_call = [&](tvm::Array<tvm::Buffer> ins, tvm::Array<tvm::Buffer> outs) {
    tvm::Array<tvm::Expr> call_args = {tvm::Expr("tvm.contrib.onnxruntime.pool_f32"),
                                       topi::detail::pack_buffer(ins[0]),
                                       topi::detail::pack_buffer(outs[0]),
                                       static_cast<int>(kind)};
    for (auto extra : trailing_args) {
      call_args.push_back(extra);
    }
    return topi::detail::call_packed(call_args);
  };

  // a single output is declared, so the tensor of interest is element 0
  return topi::detail::make_extern(
      /*output_shapes*/ {output_shape},
      /*output_types*/ {X->dtype},
      /*inputs*/ {X},
      emit_call,
      name, /*tag*/ "", /*attrs*/ {})[0];
}
// Average pooling through MakePoolCommon.
// pool_attrs.count_include_pad selects between the MLAS include-pad and
// exclude-pad averaging kinds.
tvm::Tensor AveragePool(const tvm::Tensor& X,
                        const PoolAttributes& pool_attrs,
                        const tvm::Array<tvm::Expr>& output_shape,
                        const std::string& name) {
  MLAS_POOLING_KIND kind = MlasAveragePoolingExcludePad;
  if (pool_attrs.count_include_pad) {
    kind = MlasAveragePoolingIncludePad;
  }
  return MakePoolCommon(X, pool_attrs, kind, output_shape, name);
}
// Global average pooling through MakeGlobalPoolCommon.
// pool_attrs.count_include_pad selects between the MLAS include-pad and
// exclude-pad averaging kinds; no other attribute is read here.
tvm::Tensor GlobalAveragePool(const tvm::Tensor& X,
                              const PoolAttributes& pool_attrs,
                              const tvm::Array<tvm::Expr>& output_shape,
                              const std::string& name) {
  MLAS_POOLING_KIND kind = MlasAveragePoolingExcludePad;
  if (pool_attrs.count_include_pad) {
    kind = MlasAveragePoolingIncludePad;
  }
  return MakeGlobalPoolCommon(X, kind, output_shape, name);
}
// Max pooling through MakePoolCommon, always using MlasMaximumPooling.
tvm::Tensor MaxPool(const tvm::Tensor& X,
                    const PoolAttributes& pool_attrs,
                    const tvm::Array<tvm::Expr>& output_shape,
                    const std::string& name) {
  const MLAS_POOLING_KIND kind = MlasMaximumPooling;
  return MakePoolCommon(X, pool_attrs, kind, output_shape, name);
}
// Global max pooling through MakeGlobalPoolCommon, always using
// MlasMaximumPooling. The pool attributes are not consulted.
tvm::Tensor GlobalMaxPool(const tvm::Tensor& X,
                          const PoolAttributes& /*pool_attrs*/,
                          const tvm::Array<tvm::Expr>& output_shape,
                          const std::string& name) {
  const MLAS_POOLING_KIND kind = MlasMaximumPooling;
  return MakeGlobalPoolCommon(X, kind, output_shape, name);
}
} // namespace nuphar
} // namespace onnxruntime

View file

@ -0,0 +1,37 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <string>
#include <tvm/tvm.h>
namespace onnxruntime {
// Forward declaration
struct PoolAttributes;
namespace nuphar {
// Pooling ops for the Nuphar x86 target. Each function returns an extern TVM
// tensor whose computation is a packed call into an MLAS-backed pooling
// routine.
//   X            input tensor
//   pool_attrs   pooling attributes of the node
//   output_shape shape of the tensor to produce
//   name         name given to the extern op

// Average pooling; pool_attrs.count_include_pad selects whether padding is
// counted in the average.
tvm::Tensor AveragePool(const tvm::Tensor& X,
const PoolAttributes& pool_attrs,
const tvm::Array<tvm::Expr>& output_shape,
const std::string& name = "average_pool");
// Global average pooling.
tvm::Tensor GlobalAveragePool(const tvm::Tensor& X,
const PoolAttributes& pool_attrs,
const tvm::Array<tvm::Expr>& output_shape,
const std::string& name = "global_average_pool");
// Max pooling.
tvm::Tensor MaxPool(const tvm::Tensor& X,
const PoolAttributes& pool_attrs,
const tvm::Array<tvm::Expr>& output_shape,
const std::string& name = "max_pool");
// Global max pooling; pool_attrs is not consulted by the implementation.
tvm::Tensor GlobalMaxPool(const tvm::Tensor& X,
const PoolAttributes& pool_attrs,
const tvm::Array<tvm::Expr>& output_shape,
const std::string& name = "global_max_pool");
} // namespace nuphar
} // namespace onnxruntime

View file

@ -193,6 +193,23 @@ NupharExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_vie
if (node.OpType() == "Tile" && !graph_viewer.IsConstantInitializer(inputs[1]->Name(), true))
return false; // do not support tile that has dynamic repeats
if (node.OpType() == "MaxPool") {
// TODO: enable support for Indices
if (node.OutputDefs().size() > 1) {
return false;
}
// TODO: enable support for non-default dilations
const onnxruntime::NodeAttributes& attrs = node.GetAttributes();
auto it = attrs.find("dilations");
if (it != attrs.end()) {
for (int i = 0; i < it->second.ints_size(); i++) {
if (it->second.ints(i) > 1) {
return false;
}
}
}
}
if (node.OpType() == "Slice") {
auto num_inputs = inputs.size();
ORT_ENFORCE(num_inputs > 0);