mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-31 23:27:43 +00:00
use MLAS for nuphar's pool ops (#1937)
* call MLAS's pooling function as an external call for Nuphar. Note that at the moment the Nuphar provider doesn't handle the cases below: - symbolic height/width dimensions - Indices output of MaxPool - non-default dilations * unify the pool interface for mti and mti_x86
This commit is contained in:
parent
c3ffd1f47d
commit
650fb8754b
12 changed files with 417 additions and 125 deletions
|
|
@ -3,6 +3,9 @@
|
|||
|
||||
#include "core/codegen/mti/nn/pool_ops.h"
|
||||
|
||||
#include "core/codegen/mti/mti_tvm_utils.h"
|
||||
#include "core/mlas/inc/mlas.h"
|
||||
#include "core/providers/cpu/nn/pool_attributes.h"
|
||||
#include <topi/nn/pooling.h>
|
||||
|
||||
namespace onnxruntime {
|
||||
|
|
@ -10,48 +13,50 @@ namespace tvm_codegen {
|
|||
|
||||
// TODO: topi only supports 2d-pool; MaxPool1d and MaxPool3d will need to be added if necessary.
|
||||
// Only versions < 8 are supported, because topi doesn't come with an implementation that outputs the index tensor.
|
||||
tvm::Tensor MaxPool(
|
||||
const tvm::Tensor& input,
|
||||
const tvm::Array<tvm::Expr>& kernel_size,
|
||||
const tvm::Array<tvm::Expr>& stride_size,
|
||||
const tvm::Array<tvm::Expr>& padding_size,
|
||||
const std::string& layout,
|
||||
bool count_include_pad) {
|
||||
return topi::nn::pool(input, kernel_size, stride_size, padding_size,
|
||||
topi::nn::kMaxPool,
|
||||
false,
|
||||
layout,
|
||||
count_include_pad);
|
||||
// Lowers MaxPool onto topi's generic 2-D pooling primitive.
// Kernel shape, strides and pads are taken from pool_attrs; ceil mode is always off.
// storage_order == 0 selects the "NCWH" layout string, any other value "NCHW".
// output_shape and name are unused here; they exist so that all pool builders share one signature.
tvm::Tensor MaxPool(const tvm::Tensor& input,
                    const PoolAttributes& pool_attrs,
                    const tvm::Array<tvm::Expr>& /*output_shape*/,
                    const std::string& /*name*/) {
  const auto kernel = ToTvmArray(pool_attrs.kernel_shape);
  const auto strides = ToTvmArray(pool_attrs.strides);
  const auto pads = ToTvmArray(pool_attrs.pads);
  const char* layout = (pool_attrs.storage_order == 0) ? "NCWH" : "NCHW";
  const bool ceil_mode = false;

  return topi::nn::pool(input, kernel, strides, pads,
                        topi::nn::kMaxPool, ceil_mode, layout,
                        pool_attrs.count_include_pad);
}
|
||||
|
||||
tvm::Tensor AveragePool(
|
||||
const tvm::Tensor& input,
|
||||
const tvm::Array<tvm::Expr>& kernel_size,
|
||||
const tvm::Array<tvm::Expr>& stride_size,
|
||||
const tvm::Array<tvm::Expr>& padding_size,
|
||||
const std::string& layout,
|
||||
bool count_include_pad) {
|
||||
return topi::nn::pool(input, kernel_size, stride_size, padding_size,
|
||||
topi::nn::kAvgPool,
|
||||
false,
|
||||
layout,
|
||||
count_include_pad);
|
||||
// Lowers AveragePool onto topi's generic 2-D pooling primitive using the NCHW layout.
// Kernel shape, strides, pads and count_include_pad are taken from pool_attrs; ceil mode is always off.
// output_shape and name are unused here; they exist so that all pool builders share one signature.
tvm::Tensor AveragePool(const tvm::Tensor& input,
                        const PoolAttributes& pool_attrs,
                        const tvm::Array<tvm::Expr>& /*output_shape*/,
                        const std::string& /*name*/) {
  const auto kernel = ToTvmArray(pool_attrs.kernel_shape);
  const auto strides = ToTvmArray(pool_attrs.strides);
  const auto pads = ToTvmArray(pool_attrs.pads);
  const bool ceil_mode = false;

  return topi::nn::pool(input, kernel, strides, pads,
                        topi::nn::kAvgPool, ceil_mode, "NCHW",
                        pool_attrs.count_include_pad);
}
|
||||
|
||||
tvm::Tensor GlobalMaxPool(
|
||||
const tvm::Tensor& input,
|
||||
const std::string& layout) {
|
||||
// Lowers GlobalMaxPool onto topi's global pooling primitive (max reduction, NCHW layout).
// pool_attrs, output_shape and name are unused; they exist so that all pool builders
// share one signature. The layout is fixed to "NCHW" because this signature has no
// layout parameter to forward.
tvm::Tensor GlobalMaxPool(const tvm::Tensor& input,
                          const PoolAttributes& /*pool_attrs*/,
                          const tvm::Array<tvm::Expr>& /*output_shape*/,
                          const std::string& /*name*/) {
  return topi::nn::global_pool(input,
                               /*pool_type*/ topi::nn::kMaxPool,
                               /*layout*/ "NCHW");
}
|
||||
|
||||
tvm::Tensor GlobalAveragePool(
|
||||
const tvm::Tensor& input,
|
||||
const std::string& layout) {
|
||||
// Lowers GlobalAveragePool onto topi's global pooling primitive (average reduction, NCHW layout).
// pool_attrs, output_shape and name are unused; they exist so that all pool builders
// share one signature. The layout is fixed to "NCHW" because this signature has no
// layout parameter to forward.
tvm::Tensor GlobalAveragePool(const tvm::Tensor& input,
                              const PoolAttributes& /*pool_attrs*/,
                              const tvm::Array<tvm::Expr>& /*output_shape*/,
                              const std::string& /*name*/) {
  return topi::nn::global_pool(input,
                               /*pool_type*/ topi::nn::kAvgPool,
                               /*layout*/ "NCHW");
}
|
||||
|
||||
} // namespace tvm_codegen
|
||||
|
|
|
|||
|
|
@ -6,27 +6,31 @@
|
|||
#include <tvm/tvm.h>
|
||||
|
||||
namespace onnxruntime {
|
||||
|
||||
// Forward declaration
|
||||
struct PoolAttributes;
|
||||
|
||||
namespace tvm_codegen {
|
||||
|
||||
tvm::Tensor MaxPool(const tvm::Tensor& input,
|
||||
const tvm::Array<tvm::Expr>& kernel_size,
|
||||
const tvm::Array<tvm::Expr>& stride_size,
|
||||
const tvm::Array<tvm::Expr>& padding_size,
|
||||
const std::string& layout,
|
||||
bool count_include_pad);
|
||||
const PoolAttributes& pool_attrs,
|
||||
const tvm::Array<tvm::Expr>& output_shape,
|
||||
const std::string& name = "max_pool");
|
||||
|
||||
tvm::Tensor AveragePool(const tvm::Tensor& input,
|
||||
const tvm::Array<tvm::Expr>& kernel_size,
|
||||
const tvm::Array<tvm::Expr>& stride_size,
|
||||
const tvm::Array<tvm::Expr>& padding_size,
|
||||
const std::string& layout,
|
||||
bool count_include_pad);
|
||||
const PoolAttributes& pool_attrs,
|
||||
const tvm::Array<tvm::Expr>& output_shape,
|
||||
const std::string& name = "average_pool");
|
||||
|
||||
tvm::Tensor GlobalMaxPool(const tvm::Tensor& input,
|
||||
const std::string& layout);
|
||||
const PoolAttributes& pool_attrs,
|
||||
const tvm::Array<tvm::Expr>& output_shape,
|
||||
const std::string& name = "global_max_pool");
|
||||
|
||||
tvm::Tensor GlobalAveragePool(const tvm::Tensor& input,
|
||||
const std::string& layout);
|
||||
const PoolAttributes& pool_attrs,
|
||||
const tvm::Array<tvm::Expr>& output_shape,
|
||||
const std::string& name = "global_average_pool");
|
||||
|
||||
} // namespace tvm_codegen
|
||||
} // namespace onnxruntime
|
||||
|
|
|
|||
|
|
@ -6,86 +6,44 @@
|
|||
#include "core/codegen/mti/mti_tvm_utils.h"
|
||||
#include "core/codegen/mti/nn/pool_ops.h"
|
||||
#include "core/framework/op_kernel_info.h"
|
||||
#include "core/providers/cpu/nn/pool_attributes.h"
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace tvm_codegen {
|
||||
|
||||
// helper class for pool_ops with arguments
|
||||
class FuncWithPoolingArgument {
|
||||
public:
|
||||
FuncWithPoolingArgument(const Node& node, const std::string& op_name) {
|
||||
ProtoHelperNodeContext ctx(node);
|
||||
OpNodeProtoHelper<ProtoHelperNodeContext> info(&ctx);
|
||||
int64_t storage_order{0}; // MaxPool_8 only. 0 is row major, and 1 is column major. Default is 0.
|
||||
|
||||
ORT_ENFORCE(info.GetAttrs<int64_t>("kernel_shape", kernel_shape_).IsOK(), "No kernel shape is set.");
|
||||
if (kernel_shape_.size() != 2)
|
||||
ORT_NOT_IMPLEMENTED(kernel_shape_.size(), "d pooling is not implementated");
|
||||
if (!info.GetAttrs<int64_t>("pads", pads_).IsOK() || pads_.empty()) {
|
||||
pads_.resize(kernel_shape_.size() * 2, 0);
|
||||
}
|
||||
if (!info.GetAttrs<int64_t>("strides", strides_).IsOK() || strides_.empty()) {
|
||||
strides_.resize(kernel_shape_.size(), 1);
|
||||
}
|
||||
if (op_name == "AveragePool") {
|
||||
int64_t temp;
|
||||
ORT_ENFORCE(info.GetAttr<int64_t>("count_include_pad", &temp).IsOK());
|
||||
count_include_pad_ = (temp != 0);
|
||||
}
|
||||
|
||||
if (op_name == "MaxPool") {
|
||||
// TODO: add version check or not? remove version check since only after version 8 would have storage_order, otherwise, it would be zero
|
||||
storage_order = info.GetAttrOrDefault<int64_t>("storage_order", 0 /*default_value*/);
|
||||
if (storage_order != 1) {
|
||||
layout_ = "NCWH";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<int64_t> kernel_shape_;
|
||||
std::vector<int64_t> pads_;
|
||||
std::vector<int64_t> strides_;
|
||||
std::string layout_ = "NCHW";
|
||||
bool count_include_pad_ = false;
|
||||
};
|
||||
|
||||
// A local macro to create Pool Ops
|
||||
|
||||
// helper macro that defines Evaluate of POOL_OP OpIRCreators
|
||||
// Defines Evaluate() of the generic OpIRCreator for the 2-D pool operator `name`:
// reads the pooling attributes from the node through FuncWithPoolingArgument and
// appends the tensor produced by the builder function `name` to `outputs`.
#define POOL_OP(name) \
  Status GENERIC_OP_IR_CREATOR_CLASS(name)::Evaluate( \
      const tvm::Array<tvm::Tensor>& inputs, \
      const Node& node, \
      CodeGenContext&, \
      tvm::Array<tvm::Tensor>& outputs) { \
    if (outputs.size() > 1) { \
      ORT_NOT_IMPLEMENTED("output size = 2 is not implementated"); \
    } \
    FuncWithPoolingArgument pool_args(node, #name); \
    outputs.push_back(name(inputs[0], \
                           ToTvmArray(pool_args.kernel_shape_), \
                           ToTvmArray(pool_args.strides_), \
                           ToTvmArray(pool_args.pads_), \
                           pool_args.layout_, \
                           pool_args.count_include_pad_)); \
    return Status::OK(); \
  }
|
||||
|
||||
POOL_OP(MaxPool)
|
||||
POOL_OP(AveragePool)
|
||||
|
||||
#undef POOL_OP
|
||||
|
||||
// helper macro that defines Evaluate of GlobalPOOL_OP OpIRCreators
|
||||
#define POOL_OP(name) \
|
||||
Status GENERIC_OP_IR_CREATOR_CLASS(name)::Evaluate( \
|
||||
const tvm::Array<tvm::Tensor>& inputs, \
|
||||
const Node& node, \
|
||||
CodeGenContext&, \
|
||||
tvm::Array<tvm::Tensor>& outputs) { \
|
||||
if (inputs[0]->shape.size() != 4) \
|
||||
ORT_NOT_IMPLEMENTED(gsl::narrow_cast<int64_t>(inputs[0]->shape.size()) - 2, "d global pooling is not implementated"); \
|
||||
tvm::Tensor Y = name(inputs[0], "NCHW"); \
|
||||
outputs.push_back(Y); \
|
||||
return Status::OK(); \
|
||||
// Defines Evaluate() of the generic OpIRCreator for the pool operator `name`.
// The generated function:
//   - rejects nodes that declare more than one output,
//   - builds PoolAttributes from the node for the opset version of the node's domain,
//   - rejects dilations greater than 1,
//   - only accepts 4-D inputs for global pooling and 2-D kernels otherwise,
//   - appends the tensor produced by the builder function `name` to `outputs`.
// The output count is taken from the node's output definitions because `outputs`
// is the array this function appends its result to.
#define POOL_OP(name) \
  Status GENERIC_OP_IR_CREATOR_CLASS(name)::Evaluate( \
      const tvm::Array<tvm::Tensor>& inputs, \
      const Node& node, \
      CodeGenContext& ctx_codegen, \
      tvm::Array<tvm::Tensor>& outputs) { \
    ORT_RETURN_IF_NOT(node.OutputDefs().size() == 1, "multiple outputs are not supported yet!"); \
    ProtoHelperNodeContext ctx(node); \
    OpNodeProtoHelper<ProtoHelperNodeContext> info(&ctx); \
    int version = ctx_codegen.GetCodeGenHandle()->domain_version_lookup_func(node.Domain()); \
    PoolAttributes pool_attrs(info, #name, version); \
    for (auto n : pool_attrs.dilations) { \
      ORT_RETURN_IF_NOT(n <= 1, "dilations are not supported yet!"); \
    } \
    if (pool_attrs.global_pooling) { \
      if (inputs[0]->shape.size() != 4) { \
        ORT_NOT_IMPLEMENTED(gsl::narrow_cast<int64_t>(inputs[0]->shape.size()) - 2, "d global pooling is not implementated"); \
      } \
    } else { \
      if (pool_attrs.kernel_shape.size() != 2) { \
        ORT_NOT_IMPLEMENTED(pool_attrs.kernel_shape.size(), "d pooling is not implementated"); \
      } \
    } \
    /* the topi-based builders ignore the output shape, so an empty one is passed */ \
    tvm::Array<tvm::Expr> dummy_output_shape; \
    tvm::Tensor Y = name(inputs[0], pool_attrs, dummy_output_shape); \
    outputs.push_back(Y); \
    return Status::OK(); \
  }
|
||||
|
||||
POOL_OP(GlobalMaxPool)
|
||||
POOL_OP(GlobalAveragePool)
|
||||
LIST_POOL_OPS()
|
||||
|
||||
#undef POOL_OP
|
||||
|
||||
|
|
|
|||
|
|
@ -3,7 +3,6 @@
|
|||
|
||||
#include "core/providers/nuphar/compiler/codegen_manager.h"
|
||||
|
||||
#include "core/codegen/common/op_macro.h"
|
||||
#include "core/codegen/passes/op_ir_creator/all_ops.h"
|
||||
#include "core/codegen/passes/scheduler/all_schedules.h"
|
||||
#include "core/codegen/passes/weight_layout/transpose_2d.h"
|
||||
|
|
@ -27,6 +26,7 @@ namespace nuphar {
|
|||
#define ADD_OP_ITEM(name) \
|
||||
op_ir_registry->Register(std::move(std::make_unique<NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(name)>()));
|
||||
|
||||
#define POOL_OP(OP) ADD_OP_ITEM(OP)
|
||||
#define REDUCE_V_OP(name) ADD_OP_ITEM(name)
|
||||
#define UNARY_OP(name) ADD_OP_ITEM(name)
|
||||
|
||||
|
|
@ -35,6 +35,7 @@ static void RegisterAllNupharX86OpIRCreators(tvm_codegen::OpIRRegistry* op_ir_re
|
|||
}
|
||||
|
||||
#undef ADD_OP_ITEM
|
||||
#undef POOL_OP
|
||||
#undef REDUCE_V_OP
|
||||
#undef UNARY_OP
|
||||
|
||||
|
|
@ -117,6 +118,7 @@ static void RegisterAllNupharWeightLayouts(tvm_codegen::WeightLayoutRegistry* la
|
|||
#define ADD_OP_ITEM(name) \
|
||||
dispatcher->Register(#name, registry->Get(NUPHAR_TVM_X86_OP_IR_CREATOR_STRING(name)));
|
||||
|
||||
#define POOL_OP(OP) ADD_OP_ITEM(OP)
|
||||
#define REDUCE_V_OP(name) ADD_OP_ITEM(name)
|
||||
#define UNARY_OP(name) ADD_OP_ITEM(name)
|
||||
|
||||
|
|
@ -128,6 +130,7 @@ static void RegisterNupharX86Dispatcher(const std::shared_ptr<tvm_codegen::TVMIR
|
|||
}
|
||||
|
||||
#undef ADD_OP_ITEM
|
||||
#undef POOL_OP
|
||||
#undef REDUCE_V_OP
|
||||
#undef UNARY_OP
|
||||
// END: Nuphar TVM X86 IR creator classes
|
||||
|
|
|
|||
|
|
@ -3,7 +3,6 @@
|
|||
|
||||
#include "core/providers/nuphar/compiler/nuphar_op_ir_builder.h"
|
||||
|
||||
#include "core/codegen/common/op_macro.h"
|
||||
#include "core/codegen/mti/mti_tvm_utils.h"
|
||||
#include "core/codegen/passes/op_ir_creator/all_ops.h"
|
||||
#include "core/codegen/passes/op_ir_creator/tvm_ir_builder.h"
|
||||
|
|
|
|||
|
|
@ -21,6 +21,12 @@ namespace nuphar {
|
|||
#define NUPHAR_TVM_X86_OP_IR_CREATOR_STRING(OP) \
|
||||
STRINGIZE(NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(OP))
|
||||
|
||||
// Lists the pooling operators that have a Nuphar x86 IR creator.
// Expands to one POOL_OP(<operator>) invocation per operator, so the result
// depends on how POOL_OP is defined at the point of use.
#define LIST_X86_POOL_OPS() \
|
||||
POOL_OP(MaxPool) \
|
||||
POOL_OP(AveragePool) \
|
||||
POOL_OP(GlobalMaxPool) \
|
||||
POOL_OP(GlobalAveragePool)
|
||||
|
||||
#define LIST_X86_UNARY_OPS() \
|
||||
UNARY_OP(Erf) \
|
||||
UNARY_OP(Exp) \
|
||||
|
|
@ -39,6 +45,7 @@ namespace nuphar {
|
|||
|
||||
#define LIST_ALL_X86_OPS() \
|
||||
LIST_REDUCE_V_OPS() \
|
||||
LIST_X86_POOL_OPS() \
|
||||
LIST_X86_UNARY_OPS() \
|
||||
ADD_OP_ITEM(Gemm) \
|
||||
ADD_OP_ITEM(LogSoftmax) \
|
||||
|
|
@ -51,6 +58,7 @@ namespace nuphar {
|
|||
|
||||
// Define all OPs for NupharTVMX86
|
||||
#define ADD_OP_ITEM(OP) DECLARE_NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(OP)
|
||||
#define POOL_OP(OP) ADD_OP_ITEM(OP)
|
||||
#define REDUCE_V_OP(OP) ADD_OP_ITEM(OP)
|
||||
#define UNARY_OP(OP) ADD_OP_ITEM(OP)
|
||||
|
||||
|
|
@ -58,6 +66,7 @@ LIST_ALL_X86_OPS()
|
|||
|
||||
#undef ADD_OP_ITEM
|
||||
#undef REDUCE_V_OP
|
||||
#undef POOL_OP
|
||||
#undef UNARY_OP
|
||||
|
||||
} // namespace nuphar
|
||||
|
|
|
|||
|
|
@ -3,7 +3,6 @@
|
|||
|
||||
#include "core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h"
|
||||
|
||||
#include "core/codegen/common/op_macro.h"
|
||||
#include "core/framework/op_kernel_info.h"
|
||||
#include "core/providers/nuphar/mti_x86/math/unary_ops.h"
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,83 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include "core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h"
|
||||
|
||||
#include "core/codegen/passes/utils/ort_tvm_utils.h"
|
||||
#include "core/framework/op_kernel_info.h"
|
||||
#include "core/providers/cpu/nn/pool_attributes.h"
|
||||
#include "core/providers/nuphar/mti_x86/nn/pool_ops.h"
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace nuphar {
|
||||
|
||||
// Computes the TVM output shape of a pooling node from the shape of its first input.
//
// The batch and channel dimensions are forwarded from the input (they may be symbolic).
// For global pooling every remaining dimension of the output is 1 and pool_attrs.pads
// is reset to zeros. Otherwise each pooled dimension must have a concrete value and its
// output size is computed by PoolAttributes::ComputeSizePadDilations, which also
// receives pointers to the matching entries of pool_attrs.pads and may update them.
//
// node:        pooling node whose first input provides the shape
// pool_attrs:  pooling attributes; `pads` is modified as described above
// ctx_codegen: codegen context used to translate shape dimensions to TVM expressions
// Returns the output shape as TVM expressions.
// Fails an ORT_ENFORCE if the input or its shape is missing, the input has fewer than
// 2 dimensions, the input does not have more dimensions than the kernel, or a pooled
// dimension is symbolic.
static tvm::Array<tvm::Expr> GetOutputShapeAndPads(const Node& node,
                                                   PoolAttributes& pool_attrs,
                                                   tvm_codegen::CodeGenContext& ctx_codegen) {
  const NodeArg* input = node.InputDefs()[0];
  ORT_ENFORCE(input);
  const ONNX_NAMESPACE::TensorShapeProto* shape_proto = input->Shape();
  // Shape() hands back a pointer; check it before the dereferences below.
  ORT_ENFORCE(shape_proto != nullptr, "input shape is not available");
  size_t num_input_dims = shape_proto->dim_size();
  ORT_ENFORCE(num_input_dims >= 2);

  tvm::Array<tvm::Expr> output_shape;
  // batch dimension
  output_shape.push_back(ShapeDimToTvmDim(shape_proto->dim(0), ctx_codegen));
  // output channel
  output_shape.push_back(ShapeDimToTvmDim(shape_proto->dim(1), ctx_codegen));

  size_t kernel_sz = pool_attrs.kernel_shape.size();
  if (pool_attrs.global_pooling) {
    pool_attrs.pads.assign(kernel_sz, 0);
    // skip batch and channel dimensions, so dim starts from 2
    for (size_t dim = 2; dim < num_input_dims; dim++) {
      output_shape.push_back(tvm::make_const(tvm::Int(32), 1));
    }
  } else {
    ORT_ENFORCE(num_input_dims > kernel_sz);
    // index of the first pooled dimension of the input
    size_t kernel_idx_offset = num_input_dims - kernel_sz;
    for (size_t dim = 0; dim < kernel_sz; dim++) {
      // TODO: handle symbolic dimensions
      ORT_ENFORCE(ShapeHasValue(input, dim + kernel_idx_offset));
      int64_t dim_val = ShapeValue(input, dim + kernel_idx_offset);
      int64_t dim_size = 0;
      // pads holds kernel_sz leading entries followed by kernel_sz trailing entries
      pool_attrs.ComputeSizePadDilations(static_cast<int>(dim_val),
                                         pool_attrs.strides[dim],
                                         pool_attrs.kernel_shape[dim],
                                         &(pool_attrs.pads[dim]),
                                         &(pool_attrs.pads[kernel_sz + dim]),
                                         pool_attrs.dilations[dim],
                                         &dim_size);
      output_shape.push_back(tvm::make_const(tvm::Int(32), dim_size));
    }
  }
  return output_shape;
}
|
||||
|
||||
// Defines Evaluate() of the Nuphar x86 IR creator for the pool operator `name`.
// The generated function:
//   - rejects nodes that declare more than one output and inputs that are not float32,
//   - builds PoolAttributes from the node for the opset version of the node's domain,
//   - rejects dilations greater than 1,
//   - computes the output shape (finalizing the pads) with GetOutputShapeAndPads,
//   - appends the tensor produced by the builder function `name` to `outputs`.
// The macro ends at the closing brace; there is deliberately no line continuation
// after it, so the line that follows can never be absorbed into the macro.
#define POOL_OP(name) \
  Status NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(name)::Evaluate( \
      const tvm::Array<tvm::Tensor>& inputs, \
      const Node& node, \
      tvm_codegen::CodeGenContext& ctx_codegen, \
      tvm::Array<tvm::Tensor>& outputs) { \
    ORT_RETURN_IF_NOT(node.OutputDefs().size() == 1, " multiple outputs are not supported yet!"); \
    ORT_RETURN_IF_NOT(inputs[0]->dtype == HalideIR::Float(32), " non-float32 not supported yet"); \
    ProtoHelperNodeContext ctx(node); \
    OpNodeProtoHelper<ProtoHelperNodeContext> info(&ctx); \
    int version = ctx_codegen.GetCodeGenHandle()->domain_version_lookup_func(node.Domain()); \
    PoolAttributes pool_attrs(info, #name, version); \
    for (auto n : pool_attrs.dilations) { \
      ORT_RETURN_IF_NOT(n <= 1, "dilations are not supported yet!"); \
    } \
    tvm::Array<tvm::Expr> output_shape = GetOutputShapeAndPads(node, pool_attrs, ctx_codegen); \
    tvm::Tensor Y = name(inputs[0], pool_attrs, output_shape); \
    outputs.push_back(Y); \
    return Status::OK(); \
  }
|
||||
|
||||
LIST_X86_POOL_OPS()
|
||||
|
||||
#undef POOL_OP
|
||||
|
||||
} // namespace nuphar
|
||||
} // namespace onnxruntime
|
||||
|
|
@ -77,7 +77,8 @@ class NupharKernelState {
|
|||
NUPHAR_OP(Add, 7, DataTypeImpl::AllFixedSizeTensorTypes()) \
|
||||
NUPHAR_OP(ArgMax, 1, DataTypeImpl::AllFixedSizeTensorTypes()) \
|
||||
NUPHAR_OP(ArgMin, 1, DataTypeImpl::AllFixedSizeTensorTypes()) \
|
||||
DISABLE_MACRO(NUPHAR_OP(AveragePool, 7, DataTypeImpl::AllFixedSizeTensorTypes())) \
|
||||
NUPHAR_VERSIONED_OP(AveragePool, 7, 9, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \
|
||||
NUPHAR_OP(AveragePool, 10, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \
|
||||
NUPHAR_OP(Ceil, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \
|
||||
NUPHAR_OP(Clip, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \
|
||||
NUPHAR_OP(Concat, 4, DataTypeImpl::AllFixedSizeTensorTypes()) \
|
||||
|
|
@ -94,8 +95,8 @@ class NupharKernelState {
|
|||
NUPHAR_OP(Floor, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \
|
||||
NUPHAR_VERSIONED_OP(Gemm, 7, 8, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \
|
||||
NUPHAR_OP(Gemm, 9, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \
|
||||
DISABLE_MACRO(NUPHAR_OP(GlobalAveragePool, 1, DataTypeImpl::AllFixedSizeTensorTypes())) \
|
||||
DISABLE_MACRO(NUPHAR_OP(GlobalMaxPool, 1, DataTypeImpl::AllFixedSizeTensorTypes())) \
|
||||
NUPHAR_OP(GlobalAveragePool, 1, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \
|
||||
NUPHAR_OP(GlobalMaxPool, 1, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \
|
||||
NUPHAR_OP(Greater, 9, DataTypeImpl::AllFixedSizeTensorTypes()) \
|
||||
NUPHAR_OP(HardSigmoid, 6, DataTypeImpl::AllIEEEFloatTensorTypes()) \
|
||||
NUPHAR_OP(Identity, 1, DataTypeImpl::AllFixedSizeTensorTypes()) \
|
||||
|
|
@ -107,8 +108,9 @@ class NupharKernelState {
|
|||
NUPHAR_VERSIONED_OP(MatMul, 1, 8, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \
|
||||
NUPHAR_OP(MatMul, 9, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \
|
||||
NUPHAR_OP(Max, 8, DataTypeImpl::AllFixedSizeTensorTypes()) \
|
||||
DISABLE_MACRO(NUPHAR_VERSIONED_OP(MaxPool, 1, 7, DataTypeImpl::AllFixedSizeTensorTypes())) \
|
||||
DISABLE_MACRO(NUPHAR_OP(MaxPool, 8, DataTypeImpl::AllFixedSizeTensorTypes())) \
|
||||
NUPHAR_VERSIONED_OP(MaxPool, 1, 7, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \
|
||||
NUPHAR_VERSIONED_OP(MaxPool, 8, 9, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \
|
||||
NUPHAR_OP(MaxPool, 10, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes()) \
|
||||
NUPHAR_OP(Min, 8, DataTypeImpl::AllFixedSizeTensorTypes()) \
|
||||
NUPHAR_OP(Mul, 7, DataTypeImpl::AllFixedSizeTensorTypes()) \
|
||||
NUPHAR_OP(Neg, 6, DataTypeImpl::AllFixedSizeTensorTypes()) \
|
||||
|
|
|
|||
176
onnxruntime/core/providers/nuphar/mti_x86/nn/pool_ops.cc
Normal file
176
onnxruntime/core/providers/nuphar/mti_x86/nn/pool_ops.cc
Normal file
|
|
@ -0,0 +1,176 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include "core/providers/nuphar/mti_x86/nn/pool_ops.h"
|
||||
|
||||
#include "core/codegen/mti/mti_tvm_utils.h"
|
||||
#include "core/mlas/inc/mlas.h"
|
||||
#include "core/providers/cpu/nn/pool_attributes.h"
|
||||
#include <topi/detail/extern.h>
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace nuphar {
|
||||
|
||||
// Packed function that runs an MLAS pooling kernel on float32 tensors.
// Argument layout:
//   args[0]  input tensor
//   args[1]  output tensor
//   args[2]  pooling kind (MLAS_POOLING_KIND transported as int)
//   args[3+] three length-prefixed int64 lists, in order: kernel_shape, padding, strides
TVM_REGISTER_GLOBAL("tvm.contrib.onnxruntime.pool_f32")
    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* /*ret*/) {
      DLTensor* input = args[0];
      DCHECK(tvm::runtime::TypeMatch(input->dtype, kDLFloat, 32));
      DLTensor* output = args[1];
      DCHECK(tvm::runtime::TypeMatch(output->dtype, kDLFloat, 32));

      // the enum cannot be read from the argument list directly, so go through int
      int kind_value = args[2];
      const MLAS_POOLING_KIND kind = static_cast<MLAS_POOLING_KIND>(kind_value);

      const int total_args = args.size();
      DCHECK(total_args > 3);
      int cursor = 3;

      // Reads the list that starts at `cursor` (its length first, then that many
      // values) and leaves `cursor` on the first argument after the list.
      auto read_list = [&]() {
        DCHECK(cursor < total_args);
        int64_t count = args[cursor++];

        std::vector<int64_t> values;
        for (int i = 0; i < count; ++i) {
          DCHECK(cursor < total_args);
          int64_t value = args[cursor++];
          values.push_back(value);
        }
        return values;
      };

      std::vector<int64_t> kernel_shape = read_list();
      std::vector<int64_t> padding = read_list();
      std::vector<int64_t> strides = read_list();

      // start of the tensor payload: data pointer advanced by the byte offset
      auto float_data = [](DLTensor* tensor) {
        return reinterpret_cast<float*>(static_cast<char*>(tensor->data) + tensor->byte_offset);
      };

      MlasPool(kind,
               /*num_pooling_dims*/ kernel_shape.size(),
               /*input_shape*/ input->shape,
               kernel_shape.data(),
               padding.data(),
               strides.data(),
               /*output_shape*/ output->shape,
               float_data(input),
               float_data(output),
               /*thread_pool*/ nullptr);
    });
|
||||
|
||||
// Packed function that runs an MLAS global pooling kernel on float32 tensors.
// Argument layout:
//   args[0]  input tensor
//   args[1]  output tensor
//   args[2]  pooling kind (MLAS_POOLING_KIND transported as int)
// The number of pooling dimensions is the input rank minus 2, and MlasPool is
// called with null kernel shape, padding and strides.
TVM_REGISTER_GLOBAL("tvm.contrib.onnxruntime.global_pool_f32")
    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* /*ret*/) {
      DLTensor* input = args[0];
      DCHECK(tvm::runtime::TypeMatch(input->dtype, kDLFloat, 32));
      DLTensor* output = args[1];
      DCHECK(tvm::runtime::TypeMatch(output->dtype, kDLFloat, 32));

      // the enum cannot be read from the argument list directly, so go through int
      int kind_value = args[2];
      const MLAS_POOLING_KIND kind = static_cast<MLAS_POOLING_KIND>(kind_value);

      // start of the tensor payload: data pointer advanced by the byte offset
      auto float_data = [](DLTensor* tensor) {
        return reinterpret_cast<float*>(static_cast<char*>(tensor->data) + tensor->byte_offset);
      };

      MlasPool(kind,
               /*num_pooling_dims*/ input->ndim - 2,
               /*input_shape*/ input->shape,
               /*kernel_shape*/ nullptr,
               /*padding*/ nullptr,
               /*strides*/ nullptr,
               /*output_shape*/ output->shape,
               float_data(input),
               float_data(output),
               /*thread_pool*/ nullptr);
    });
|
||||
|
||||
// Builds the extern tensor that performs global pooling of X by calling the packed
// function "tvm.contrib.onnxruntime.global_pool_f32".
//
// X:            input tensor; the result has the same element type
// kind:         MLAS pooling kind, passed to the packed function as an int
// output_shape: shape of the resulting tensor
// name:         name given to the extern operation
// Returns the single output of the extern operation.
static tvm::Tensor MakeGlobalPoolCommon(const tvm::Tensor& X,
                                        const MLAS_POOLING_KIND kind,
                                        const tvm::Array<tvm::Expr>& output_shape,
                                        const std::string& name) {
  // Emits the call: function name, packed input buffer, packed output buffer, pooling kind.
  auto emit_call = [&](tvm::Array<tvm::Buffer> ins, tvm::Array<tvm::Buffer> outs) {
    tvm::Array<tvm::Expr> call_args = {tvm::Expr("tvm.contrib.onnxruntime.global_pool_f32"),
                                       topi::detail::pack_buffer(ins[0]),
                                       topi::detail::pack_buffer(outs[0]),
                                       static_cast<int>(kind)};
    return topi::detail::call_packed(call_args);
  };

  const auto results = topi::detail::make_extern(
      /*output_shapes*/ {output_shape},
      /*output_types*/ {X->dtype},
      /*inputs*/ {X},
      emit_call,
      name, /*tag*/ "", /*attrs*/ {});
  return results[0];
}
|
||||
|
||||
// Builds the extern tensor that performs (non-global) pooling of X by calling the
// packed function "tvm.contrib.onnxruntime.pool_f32".
//
// X:            input tensor with 3 to 5 dimensions; the result has the same element type
// pool_attrs:   provides kernel_shape, pads and strides; kernel_shape must have one
//               entry per pooled dimension (rank of X minus 2)
// kind:         MLAS pooling kind, passed to the packed function as an int
// output_shape: shape of the resulting tensor
// name:         name given to the extern operation
// Returns the single output of the extern operation.
static tvm::Tensor MakePoolCommon(const tvm::Tensor& X,
                                  const PoolAttributes& pool_attrs,
                                  const MLAS_POOLING_KIND kind,
                                  const tvm::Array<tvm::Expr>& output_shape,
                                  const std::string& name) {
  const size_t input_rank = X.ndim();
  ORT_ENFORCE(input_rank >= 3, "Input dimension must be >= 3");
  const size_t pooled_rank = input_rank - 2;
  ORT_ENFORCE(pooled_rank <= 3, "pooling size must be <= 3");
  ORT_ENFORCE(pooled_rank == pool_attrs.kernel_shape.size(),
              "kernel_shape num_dims is not compatible with X num_dims.");

  // Each list is flattened as its length followed by its values (all int64 constants);
  // the packed function reads the lists back in the same order.
  tvm::Array<tvm::Expr> flattened_lists;
  auto append_list = [&](const std::vector<int64_t>& values) {
    flattened_lists.push_back(tvm::make_const(tvm::Int(64), static_cast<int64_t>(values.size())));
    for (auto value : values) {
      flattened_lists.push_back(tvm::make_const(tvm::Int(64), value));
    }
  };
  append_list(pool_attrs.kernel_shape);
  append_list(pool_attrs.pads);
  append_list(pool_attrs.strides);

  // Emits the call: function name, packed input/output buffers, pooling kind,
  // then kernel_shape, pads and strides exactly as flattened above.
  auto emit_call = [&](tvm::Array<tvm::Buffer> ins, tvm::Array<tvm::Buffer> outs) {
    tvm::Array<tvm::Expr> call_args = {tvm::Expr("tvm.contrib.onnxruntime.pool_f32"),
                                       topi::detail::pack_buffer(ins[0]),
                                       topi::detail::pack_buffer(outs[0]),
                                       static_cast<int>(kind)};
    for (const auto& list_entry : flattened_lists) {
      call_args.push_back(list_entry);
    }
    return topi::detail::call_packed(call_args);
  };

  const auto results = topi::detail::make_extern(
      /*output_shapes*/ {output_shape},
      /*output_types*/ {X->dtype},
      /*inputs*/ {X},
      emit_call,
      name, /*tag*/ "", /*attrs*/ {});
  return results[0];
}
|
||||
|
||||
// Builds the AveragePool tensor backed by the MLAS pooling extern call.
// pool_attrs.count_include_pad selects whether padded elements count towards the average.
tvm::Tensor AveragePool(const tvm::Tensor& X,
                        const PoolAttributes& pool_attrs,
                        const tvm::Array<tvm::Expr>& output_shape,
                        const std::string& name) {
  MLAS_POOLING_KIND kind = MlasAveragePoolingExcludePad;
  if (pool_attrs.count_include_pad) {
    kind = MlasAveragePoolingIncludePad;
  }
  return MakePoolCommon(X, pool_attrs, kind, output_shape, name);
}
|
||||
|
||||
// Builds the GlobalAveragePool tensor backed by the MLAS global pooling extern call.
// Only count_include_pad is read from pool_attrs; it selects the MLAS averaging kind.
tvm::Tensor GlobalAveragePool(const tvm::Tensor& X,
                              const PoolAttributes& pool_attrs,
                              const tvm::Array<tvm::Expr>& output_shape,
                              const std::string& name) {
  MLAS_POOLING_KIND kind = MlasAveragePoolingExcludePad;
  if (pool_attrs.count_include_pad) {
    kind = MlasAveragePoolingIncludePad;
  }
  return MakeGlobalPoolCommon(X, kind, output_shape, name);
}
|
||||
|
||||
// Builds the MaxPool tensor backed by the MLAS pooling extern call.
tvm::Tensor MaxPool(const tvm::Tensor& X,
                    const PoolAttributes& pool_attrs,
                    const tvm::Array<tvm::Expr>& output_shape,
                    const std::string& name) {
  const MLAS_POOLING_KIND kind = MlasMaximumPooling;
  return MakePoolCommon(X, pool_attrs, kind, output_shape, name);
}
|
||||
|
||||
// Builds the GlobalMaxPool tensor backed by the MLAS global pooling extern call.
// pool_attrs is not used; it is part of the signature shared by all pool builders.
tvm::Tensor GlobalMaxPool(const tvm::Tensor& X,
                          const PoolAttributes& /*pool_attrs*/,
                          const tvm::Array<tvm::Expr>& output_shape,
                          const std::string& name) {
  const MLAS_POOLING_KIND kind = MlasMaximumPooling;
  return MakeGlobalPoolCommon(X, kind, output_shape, name);
}
|
||||
|
||||
} // namespace nuphar
|
||||
} // namespace onnxruntime
|
||||
37
onnxruntime/core/providers/nuphar/mti_x86/nn/pool_ops.h
Normal file
37
onnxruntime/core/providers/nuphar/mti_x86/nn/pool_ops.h
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <tvm/tvm.h>
|
||||
|
||||
namespace onnxruntime {
|
||||
|
||||
// Forward declaration
|
||||
struct PoolAttributes;
|
||||
|
||||
namespace nuphar {
|
||||
|
||||
// Pool builders for the Nuphar x86 target. All four share one signature:
//   X            input tensor
//   pool_attrs   pooling attributes of the node
//   output_shape shape of the tensor to create
//   name         name of the created operation (each builder has its own default)

// Average pooling of X; pool_attrs.count_include_pad selects whether padded
// elements count towards the average.
tvm::Tensor AveragePool(const tvm::Tensor& X,
|
||||
const PoolAttributes& pool_attrs,
|
||||
const tvm::Array<tvm::Expr>& output_shape,
|
||||
const std::string& name = "average_pool");
|
||||
|
||||
// Global average pooling of X; only count_include_pad is read from pool_attrs.
tvm::Tensor GlobalAveragePool(const tvm::Tensor& X,
|
||||
const PoolAttributes& pool_attrs,
|
||||
const tvm::Array<tvm::Expr>& output_shape,
|
||||
const std::string& name = "global_average_pool");
|
||||
|
||||
// Max pooling of X using kernel_shape, pads and strides from pool_attrs.
tvm::Tensor MaxPool(const tvm::Tensor& X,
|
||||
const PoolAttributes& pool_attrs,
|
||||
const tvm::Array<tvm::Expr>& output_shape,
|
||||
const std::string& name = "max_pool");
|
||||
|
||||
// Global max pooling of X; pool_attrs is not used by the implementation.
tvm::Tensor GlobalMaxPool(const tvm::Tensor& X,
|
||||
const PoolAttributes& pool_attrs,
|
||||
const tvm::Array<tvm::Expr>& output_shape,
|
||||
const std::string& name = "global_max_pool");
|
||||
|
||||
} // namespace nuphar
|
||||
} // namespace onnxruntime
|
||||
|
|
@ -193,6 +193,23 @@ NupharExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_vie
|
|||
if (node.OpType() == "Tile" && !graph_viewer.IsConstantInitializer(inputs[1]->Name(), true))
|
||||
return false; // do not support tile that has dynamic repeats
|
||||
|
||||
if (node.OpType() == "MaxPool") {
|
||||
// TODO: enable support for Indices
|
||||
if (node.OutputDefs().size() > 1) {
|
||||
return false;
|
||||
}
|
||||
// TODO: enable support for non-default dilations
|
||||
const onnxruntime::NodeAttributes& attrs = node.GetAttributes();
|
||||
auto it = attrs.find("dilations");
|
||||
if (it != attrs.end()) {
|
||||
for (int i = 0; i < it->second.ints_size(); i++) {
|
||||
if (it->second.ints(i) > 1) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (node.OpType() == "Slice") {
|
||||
auto num_inputs = inputs.size();
|
||||
ORT_ENFORCE(num_inputs > 0);
|
||||
|
|
|
|||
Loading…
Reference in a new issue