use MLAS for nuphar's pool ops (#1937)

* Call MLAS's pooling function as an external call for Nuphar.
  Note that at the moment the Nuphar provider doesn't handle the cases below:
  - symbolic height/width dimensions
  - Indices output of MaxPool
  - non-default dilations
* Unify the pool interface for mti and mti_x86.
2026-07-17 18:40:28 +00:00 · 2019-09-26 16:29:30 -07:00 · 2019-09-26 16:29:30 -07:00 · 650fb8754b
commit 650fb8754b
parent c3ffd1f47d
12 changed files with 417 additions and 125 deletions
--- a/onnxruntime/core/codegen/mti/nn/pool_ops.cc
+++ b/onnxruntime/core/codegen/mti/nn/pool_ops.cc
@ -3,6 +3,9 @@

 #include "core/codegen/mti/nn/pool_ops.h"

+#include "core/codegen/mti/mti_tvm_utils.h"
+#include "core/mlas/inc/mlas.h"
+#include "core/providers/cpu/nn/pool_attributes.h"
 #include <topi/nn/pooling.h>

 namespace onnxruntime {
@ -10,48 +13,50 @@ namespace tvm_codegen {

 // TODO: topi only support 2d-pool, MaxPool1d and MaxPool3d will need to be added if necessary.
 // only support version < 8 for topi doesn't come with implementation to output index tensor
-tvm::Tensor MaxPool(
-    const tvm::Tensor& input,
-    const tvm::Array<tvm::Expr>& kernel_size,
-    const tvm::Array<tvm::Expr>& stride_size,
-    const tvm::Array<tvm::Expr>& padding_size,
-    const std::string& layout,
-    bool count_include_pad) {
-  return topi::nn::pool(input, kernel_size, stride_size, padding_size,
-                        topi::nn::kMaxPool,
-                        false,
-                        layout,
-                        count_include_pad);
+tvm::Tensor MaxPool(const tvm::Tensor& input,  // max pooling expressed through topi::nn::pool
+                    const PoolAttributes& pool_attrs,  // kernel_shape, strides, pads, storage_order and count_include_pad are read
+                    const tvm::Array<tvm::Expr>& /*output_shape*/,  // part of the shared pool signature; not used here
+                    const std::string& /*name*/) {  // part of the shared pool signature; not used here
+  return topi::nn::pool(input,
+                        ToTvmArray(pool_attrs.kernel_shape),
+                        ToTvmArray(pool_attrs.strides),
+                        ToTvmArray(pool_attrs.pads),
+                        /*pool_type*/ topi::nn::kMaxPool,
+                        /*ceil_mode*/ false,  // always false; no field of pool_attrs is consulted for it
+                        /*layout*/ pool_attrs.storage_order == 0 ? "NCWH" : "NCHW",  // storage_order 0 selects "NCWH", any other value "NCHW"
+                        pool_attrs.count_include_pad);
 }

-tvm::Tensor AveragePool(
-    const tvm::Tensor& input,
-    const tvm::Array<tvm::Expr>& kernel_size,
-    const tvm::Array<tvm::Expr>& stride_size,
-    const tvm::Array<tvm::Expr>& padding_size,
-    const std::string& layout,
-    bool count_include_pad) {
-  return topi::nn::pool(input, kernel_size, stride_size, padding_size,
-                        topi::nn::kAvgPool,
-                        false,
-                        layout,
-                        count_include_pad);
+tvm::Tensor AveragePool(const tvm::Tensor& input,  // average pooling expressed through topi::nn::pool
+                        const PoolAttributes& pool_attrs,  // kernel_shape, strides, pads and count_include_pad are read
+                        const tvm::Array<tvm::Expr>& /*output_shape*/,  // part of the shared pool signature; not used here
+                        const std::string& /*name*/) {  // part of the shared pool signature; not used here
+  return topi::nn::pool(input,
+                        ToTvmArray(pool_attrs.kernel_shape),
+                        ToTvmArray(pool_attrs.strides),
+                        ToTvmArray(pool_attrs.pads),
+                        /*pool_type*/ topi::nn::kAvgPool,
+                        /*ceil_mode*/ false,  // always false; no field of pool_attrs is consulted for it
+                        /*layout*/ "NCHW",  // layout is fixed; storage_order is not consulted
+                        pool_attrs.count_include_pad);
 }

-tvm::Tensor GlobalMaxPool(
-    const tvm::Tensor& input,
-    const std::string& layout) {
+tvm::Tensor GlobalMaxPool(const tvm::Tensor& input,  // global max pooling expressed through topi::nn::global_pool
+                          const PoolAttributes& /*pool_attrs*/,  // part of the shared pool signature; not used here
+                          const tvm::Array<tvm::Expr>& /*output_shape*/,  // part of the shared pool signature; not used here
+                          const std::string& /*name*/) {  // part of the shared pool signature; not used here
  return topi::nn::global_pool(input,
-                               topi::nn::kMaxPool,
-                               layout);
+                               /*pool_type*/ topi::nn::kMaxPool,
+                               /*layout*/ "NCHW");  // layout is now a fixed literal instead of a caller-supplied argument
 }

-tvm::Tensor GlobalAveragePool(
-    const tvm::Tensor& input,
-    const std::string& layout) {
+tvm::Tensor GlobalAveragePool(const tvm::Tensor& input,  // global average pooling expressed through topi::nn::global_pool
+                              const PoolAttributes& /*pool_attrs*/,  // part of the shared pool signature; not used here
+                              const tvm::Array<tvm::Expr>& /*output_shape*/,  // part of the shared pool signature; not used here
+                              const std::string& /*name*/) {  // part of the shared pool signature; not used here
  return topi::nn::global_pool(input,
-                               topi::nn::kAvgPool,
-                               layout);
+                               /*pool_type*/ topi::nn::kAvgPool,
+                               /*layout*/ "NCHW");  // layout is now a fixed literal instead of a caller-supplied argument
 }

 }  // namespace tvm_codegen
--- a/onnxruntime/core/codegen/mti/nn/pool_ops.h
+++ b/onnxruntime/core/codegen/mti/nn/pool_ops.h
@ -6,27 +6,31 @@
 #include <tvm/tvm.h>

 namespace onnxruntime {
+
+// Forward declaration
+struct PoolAttributes;
+
 namespace tvm_codegen {

 tvm::Tensor MaxPool(const tvm::Tensor& input,
-                    const tvm::Array<tvm::Expr>& kernel_size,
-                    const tvm::Array<tvm::Expr>& stride_size,
-                    const tvm::Array<tvm::Expr>& padding_size,
-                    const std::string& layout,
-                    bool count_include_pad);
+                    const PoolAttributes& pool_attrs,
+                    const tvm::Array<tvm::Expr>& output_shape,
+                    const std::string& name = "max_pool");

 tvm::Tensor AveragePool(const tvm::Tensor& input,
-                        const tvm::Array<tvm::Expr>& kernel_size,
-                        const tvm::Array<tvm::Expr>& stride_size,
-                        const tvm::Array<tvm::Expr>& padding_size,
-                        const std::string& layout,
-                        bool count_include_pad);
+                        const PoolAttributes& pool_attrs,
+                        const tvm::Array<tvm::Expr>& output_shape,
+                        const std::string& name = "average_pool");

 tvm::Tensor GlobalMaxPool(const tvm::Tensor& input,
-                          const std::string& layout);
+                          const PoolAttributes& pool_attrs,
+                          const tvm::Array<tvm::Expr>& output_shape,
+                          const std::string& name = "global_max_pool");

 tvm::Tensor GlobalAveragePool(const tvm::Tensor& input,
-                              const std::string& layout);
+                              const PoolAttributes& pool_attrs,
+                              const tvm::Array<tvm::Expr>& output_shape,
+                              const std::string& name = "global_average_pool");

 }  // namespace tvm_codegen
 }  // namespace onnxruntime
--- a/onnxruntime/core/codegen/passes/op_ir_creator/nn/pool_ops.cc
+++ b/onnxruntime/core/codegen/passes/op_ir_creator/nn/pool_ops.cc
@ -6,86 +6,44 @@
 #include "core/codegen/mti/mti_tvm_utils.h"
 #include "core/codegen/mti/nn/pool_ops.h"
 #include "core/framework/op_kernel_info.h"
+#include "core/providers/cpu/nn/pool_attributes.h"

 namespace onnxruntime {
 namespace tvm_codegen {

-// helper class for pool_ops with arguments
-class FuncWithPoolingArgument {
- public:
-  FuncWithPoolingArgument(const Node& node, const std::string& op_name) {
-    ProtoHelperNodeContext ctx(node);
-    OpNodeProtoHelper<ProtoHelperNodeContext> info(&ctx);
-    int64_t storage_order{0};  // MaxPool_8 only. 0 is row major, and 1 is column major. Default is 0.
-
-    ORT_ENFORCE(info.GetAttrs<int64_t>("kernel_shape", kernel_shape_).IsOK(), "No kernel shape is set.");
-    if (kernel_shape_.size() != 2)
-      ORT_NOT_IMPLEMENTED(kernel_shape_.size(), "d pooling is not implementated");
-    if (!info.GetAttrs<int64_t>("pads", pads_).IsOK() || pads_.empty()) {
-      pads_.resize(kernel_shape_.size() * 2, 0);
-    }
-    if (!info.GetAttrs<int64_t>("strides", strides_).IsOK() || strides_.empty()) {
-      strides_.resize(kernel_shape_.size(), 1);
-    }
-    if (op_name == "AveragePool") {
-      int64_t temp;
-      ORT_ENFORCE(info.GetAttr<int64_t>("count_include_pad", &temp).IsOK());
-      count_include_pad_ = (temp != 0);
-    }
-
-    if (op_name == "MaxPool") {
-      // TODO: add version check or not? remove version check since only after version 8 would have storage_order, otherwise, it would be zero
-      storage_order = info.GetAttrOrDefault<int64_t>("storage_order", 0 /*default_value*/);
-      if (storage_order != 1) {
-        layout_ = "NCWH";
-      }
-    }
-  }
-
-  std::vector<int64_t> kernel_shape_;
-  std::vector<int64_t> pads_;
-  std::vector<int64_t> strides_;
-  std::string layout_ = "NCHW";
-  bool count_include_pad_ = false;
-};
-
 // A local macro to create Pool Ops

 // helper macro defines Evaluate of of POOL_OP OpIRCreators
-#define POOL_OP(name)                                                                                                                                                         \
-  Status GENERIC_OP_IR_CREATOR_CLASS(name)::Evaluate(                                                                                                                         \
-      const tvm::Array<tvm::Tensor>& inputs,                                                                                                                                  \
-      const Node& node,                                                                                                                                                       \
-      CodeGenContext&,                                                                                                                                                        \
-      tvm::Array<tvm::Tensor>& outputs) {                                                                                                                                     \
-    if (outputs.size() > 1) ORT_NOT_IMPLEMENTED("output size = 2 is not implementated");                                                                                      \
-    FuncWithPoolingArgument argment(node, #name);                                                                                                                             \
-    tvm::Tensor Y = name(inputs[0], ToTvmArray(argment.kernel_shape_), ToTvmArray(argment.strides_), ToTvmArray(argment.pads_), argment.layout_, argment.count_include_pad_); \
-    outputs.push_back(Y);                                                                                                                                                     \
-    return Status::OK();                                                                                                                                                      \
-  }  // namespace tvm_codegen
-
-POOL_OP(MaxPool)
-POOL_OP(AveragePool)
-
-#undef POOL_OP
-
-// helper macro defines Evaluate of of GlobalPOOL_OP OpIRCreators
-#define POOL_OP(name)                                                                                                       \
-  Status GENERIC_OP_IR_CREATOR_CLASS(name)::Evaluate(                                                                       \
-      const tvm::Array<tvm::Tensor>& inputs,                                                                                \
-      const Node& node,                                                                                                     \
-      CodeGenContext&,                                                                                                      \
-      tvm::Array<tvm::Tensor>& outputs) {                                                                                   \
-    if (inputs[0]->shape.size() != 4)                                                                                       \
-      ORT_NOT_IMPLEMENTED(gsl::narrow_cast<int64_t>(inputs[0]->shape.size()) - 2, "d global pooling is not implementated"); \
-    tvm::Tensor Y = name(inputs[0], "NCHW");                                                                                \
-    outputs.push_back(Y);                                                                                                   \
-    return Status::OK();                                                                                                    \
+#define POOL_OP(name) /* defines Evaluate() for the generic OpIRCreator of pool op name */                                    \
+  Status GENERIC_OP_IR_CREATOR_CLASS(name)::Evaluate(                                                                         \
+      const tvm::Array<tvm::Tensor>& inputs,                                                                                  \
+      const Node& node,                                                                                                       \
+      CodeGenContext& ctx_codegen,                                                                                            \
+      tvm::Array<tvm::Tensor>& outputs) {                                                                                     \
+    ORT_RETURN_IF_NOT(outputs.size() == 1, "multiple outputs are not supported yet!"); /* NOTE(review): tests the outputs array, whereas the Nuphar x86 creator tests node.OutputDefs().size() -- confirm which is intended */ \
+    ProtoHelperNodeContext ctx(node);                                                                                         \
+    OpNodeProtoHelper<ProtoHelperNodeContext> info(&ctx);                                                                     \
+    int version = ctx_codegen.GetCodeGenHandle()->domain_version_lookup_func(node.Domain()); /* version looked up for the node domain */ \
+    PoolAttributes pool_attrs(info, #name, version);                                                                          \
+    for (auto n : pool_attrs.dilations) {                                                                                     \
+      ORT_RETURN_IF_NOT(n <= 1, "dilations are not supported yet!"); /* any dilation above 1 is rejected */                   \
+    }                                                                                                                         \
+    if (pool_attrs.global_pooling) { /* global pooling: only 4-D inputs are accepted */                                       \
+      if (inputs[0]->shape.size() != 4) {                                                                                     \
+        ORT_NOT_IMPLEMENTED(gsl::narrow_cast<int64_t>(inputs[0]->shape.size()) - 2, "d global pooling is not implementated"); \
+      }                                                                                                                       \
+    } else { /* otherwise the kernel must be 2-D */                                                                           \
+      if (pool_attrs.kernel_shape.size() != 2) {                                                                              \
+        ORT_NOT_IMPLEMENTED(pool_attrs.kernel_shape.size(), "d pooling is not implementated");                                \
+      }                                                                                                                       \
+    }                                                                                                                         \
+    tvm::Array<tvm::Expr> dummy_output_shape; /* left empty; passed only to satisfy the shared pool signature */              \
+    tvm::Tensor Y = name(inputs[0], pool_attrs, dummy_output_shape);                                                          \
+    outputs.push_back(Y);                                                                                                     \
+    return Status::OK();                                                                                                      \
  }

-POOL_OP(GlobalMaxPool)
-POOL_OP(GlobalAveragePool)
+LIST_POOL_OPS()

 #undef POOL_OP

--- a/onnxruntime/core/providers/nuphar/compiler/codegen_manager.cc
+++ b/onnxruntime/core/providers/nuphar/compiler/codegen_manager.cc
@ -3,7 +3,6 @@

 #include "core/providers/nuphar/compiler/codegen_manager.h"

-#include "core/codegen/common/op_macro.h"
 #include "core/codegen/passes/op_ir_creator/all_ops.h"
 #include "core/codegen/passes/scheduler/all_schedules.h"
 #include "core/codegen/passes/weight_layout/transpose_2d.h"
@ -27,6 +26,7 @@ namespace nuphar {
 #define ADD_OP_ITEM(name) \
  op_ir_registry->Register(std::move(std::make_unique<NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(name)>()));

+#define POOL_OP(OP) ADD_OP_ITEM(OP)
 #define REDUCE_V_OP(name) ADD_OP_ITEM(name)
 #define UNARY_OP(name) ADD_OP_ITEM(name)

@ -35,6 +35,7 @@ static void RegisterAllNupharX86OpIRCreators(tvm_codegen::OpIRRegistry* op_ir_re
 }

 #undef ADD_OP_ITEM
+#undef POOL_OP
 #undef REDUCE_V_OP
 #undef UNARY_OP

@ -117,6 +118,7 @@ static void RegisterAllNupharWeightLayouts(tvm_codegen::WeightLayoutRegistry* la
 #define ADD_OP_ITEM(name) \
  dispatcher->Register(#name, registry->Get(NUPHAR_TVM_X86_OP_IR_CREATOR_STRING(name)));

+#define POOL_OP(OP) ADD_OP_ITEM(OP)
 #define REDUCE_V_OP(name) ADD_OP_ITEM(name)
 #define UNARY_OP(name) ADD_OP_ITEM(name)

@ -128,6 +130,7 @@ static void RegisterNupharX86Dispatcher(const std::shared_ptr<tvm_codegen::TVMIR
 }

 #undef ADD_OP_ITEM
+#undef POOL_OP
 #undef REDUCE_V_OP
 #undef UNARY_OP
 // END: Nuphar TVM X86 IR creator classes
--- a/onnxruntime/core/providers/nuphar/compiler/nuphar_op_ir_builder.cc
+++ b/onnxruntime/core/providers/nuphar/compiler/nuphar_op_ir_builder.cc
@ -3,7 +3,6 @@

 #include "core/providers/nuphar/compiler/nuphar_op_ir_builder.h"

-#include "core/codegen/common/op_macro.h"
 #include "core/codegen/mti/mti_tvm_utils.h"
 #include "core/codegen/passes/op_ir_creator/all_ops.h"
 #include "core/codegen/passes/op_ir_creator/tvm_ir_builder.h"
--- a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h
+++ b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h
@ -21,6 +21,12 @@ namespace nuphar {
 #define NUPHAR_TVM_X86_OP_IR_CREATOR_STRING(OP) \
  STRINGIZE(NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(OP))

+#define LIST_X86_POOL_OPS()  /* pool ops of the Nuphar x86 creators; each entry expands through whatever POOL_OP is defined as at the point of use */ \
+  POOL_OP(MaxPool)           \
+  POOL_OP(AveragePool)       \
+  POOL_OP(GlobalMaxPool)     \
+  POOL_OP(GlobalAveragePool)
+
 #define LIST_X86_UNARY_OPS()   \
  UNARY_OP(Erf)                \
  UNARY_OP(Exp)                \
@ -39,6 +45,7 @@ namespace nuphar {

 #define LIST_ALL_X86_OPS()     \
  LIST_REDUCE_V_OPS()          \
+  LIST_X86_POOL_OPS()          \
  LIST_X86_UNARY_OPS()         \
  ADD_OP_ITEM(Gemm)            \
  ADD_OP_ITEM(LogSoftmax)      \
@ -51,6 +58,7 @@ namespace nuphar {

 // Define all OPs for NupharTVMX86
 #define ADD_OP_ITEM(OP) DECLARE_NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(OP)
+#define POOL_OP(OP) ADD_OP_ITEM(OP)
 #define REDUCE_V_OP(OP) ADD_OP_ITEM(OP)
 #define UNARY_OP(OP) ADD_OP_ITEM(OP)

@ -58,6 +66,7 @@ LIST_ALL_X86_OPS()

 #undef ADD_OP_ITEM
 #undef REDUCE_V_OP
+#undef POOL_OP
 #undef UNARY_OP

 }  // namespace nuphar
--- a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/unary_ops.cc
+++ b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/math/unary_ops.cc
@ -3,7 +3,6 @@

 #include "core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h"

-#include "core/codegen/common/op_macro.h"
 #include "core/framework/op_kernel_info.h"
 #include "core/providers/nuphar/mti_x86/math/unary_ops.h"

--- a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/nn/pool_ops.cc
+++ b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/nn/pool_ops.cc
@ -0,0 +1,83 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/providers/nuphar/compiler/x86/op_ir_creator/all_ops.h"
+
+#include "core/codegen/passes/utils/ort_tvm_utils.h"
+#include "core/framework/op_kernel_info.h"
+#include "core/providers/cpu/nn/pool_attributes.h"
+#include "core/providers/nuphar/mti_x86/nn/pool_ops.h"
+
+namespace onnxruntime {
+namespace nuphar {
+
+static tvm::Array<tvm::Expr> GetOutputShapeAndPads(const Node& node,  // builds the pool output shape from the shape of the node's first input
+                                                   PoolAttributes& pool_attrs,  // in/out: pool_attrs.pads may be modified in place
+                                                   tvm_codegen::CodeGenContext& ctx_codegen) {
+  const NodeArg* input = node.InputDefs()[0];
+  ORT_ENFORCE(input);
+  const ONNX_NAMESPACE::TensorShapeProto* shape_proto = input->Shape();  // NOTE(review): dereferenced below without a null check -- confirm Shape() is always set here
+  size_t num_input_dims = shape_proto->dim_size();
+  ORT_ENFORCE(num_input_dims >= 2);
+
+  tvm::Array<tvm::Expr> output_shape;
+  // batch dimension
+  output_shape.push_back(ShapeDimToTvmDim(shape_proto->dim(0), ctx_codegen));
+  // output channel
+  output_shape.push_back(ShapeDimToTvmDim(shape_proto->dim(1), ctx_codegen));
+
+  size_t kernel_sz = pool_attrs.kernel_shape.size();
+  if (pool_attrs.global_pooling) {
+    pool_attrs.pads.assign(kernel_sz, 0);  // global pooling: pads becomes kernel_sz zeros
+    // skip batch and channel dimensions, so dim starts from 2
+    for (size_t dim = 2; dim < num_input_dims; dim++) {
+      output_shape.push_back(tvm::make_const(tvm::Int(32), 1));  // every remaining dimension gets extent 1
+    }
+  } else {
+    ORT_ENFORCE(num_input_dims > kernel_sz);
+    size_t kernel_idx_offset = num_input_dims - kernel_sz;  // kernel dims line up with the trailing input dims
+    for (size_t dim = 0; dim < kernel_sz; dim++) {
+      // TODO: handle symbolic dimensions
+      ORT_ENFORCE(ShapeHasValue(input, dim + kernel_idx_offset));
+      int64_t dim_val = ShapeValue(input, dim + kernel_idx_offset);
+      int64_t dim_size = 0;
+      pool_attrs.ComputeSizePadDilations(static_cast<int>(dim_val),  // NOTE(review): narrows the int64_t extent to int -- confirm this is intended
+                                         pool_attrs.strides[dim],
+                                         pool_attrs.kernel_shape[dim],
+                                         &(pool_attrs.pads[dim]),  // pads entry at index dim, passed by pointer
+                                         &(pool_attrs.pads[kernel_sz + dim]),  // pads entry at index kernel_sz + dim, passed by pointer
+                                         pool_attrs.dilations[dim],
+                                         &dim_size);  // receives the value pushed below as the output extent
+      output_shape.push_back(tvm::make_const(tvm::Int(32), dim_size));
+    }
+  }
+  return output_shape;
+}
+
+#define POOL_OP(name) /* defines Evaluate() for the Nuphar x86 creator of pool op name */        \
+  Status NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(name)::Evaluate(                                      \
+      const tvm::Array<tvm::Tensor>& inputs,                                                      \
+      const Node& node,                                                                           \
+      tvm_codegen::CodeGenContext& ctx_codegen,                                                   \
+      tvm::Array<tvm::Tensor>& outputs) {                                                         \
+    ORT_RETURN_IF_NOT(node.OutputDefs().size() == 1, " multiple outputs are not supported yet!"); \
+    ORT_RETURN_IF_NOT(inputs[0]->dtype == HalideIR::Float(32), " non-float32 not supported yet"); \
+    ProtoHelperNodeContext ctx(node);                                                             \
+    OpNodeProtoHelper<ProtoHelperNodeContext> info(&ctx);                                         \
+    int version = ctx_codegen.GetCodeGenHandle()->domain_version_lookup_func(node.Domain());      \
+    PoolAttributes pool_attrs(info, #name, version);                                              \
+    for (auto n : pool_attrs.dilations) {                                                         \
+      ORT_RETURN_IF_NOT(n <= 1, "dilations are not supported yet!"); /* any dilation above 1 is rejected */ \
+    }                                                                                             \
+    tvm::Array<tvm::Expr> output_shape = GetOutputShapeAndPads(node, pool_attrs, ctx_codegen); /* may also modify pool_attrs.pads */ \
+    tvm::Tensor Y = name(inputs[0], pool_attrs, output_shape);                                    \
+    outputs.push_back(Y);                                                                         \
+    return Status::OK();                                                                          \
+  } /* NOTE(review): the continuation after this brace extends the macro onto the following blank line */ \
+
+LIST_X86_POOL_OPS()
+
+#undef POOL_OP
+
+}  // namespace nuphar
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/nuphar/kernel.h
+++ b/onnxruntime/core/providers/nuphar/kernel.h
@ -77,7 +77,8 @@ class NupharKernelState {
  NUPHAR_OP(Add, 7, DataTypeImpl::AllFixedSizeTensorTypes())                                 \
  NUPHAR_OP(ArgMax, 1, DataTypeImpl::AllFixedSizeTensorTypes())                              \
  NUPHAR_OP(ArgMin, 1, DataTypeImpl::AllFixedSizeTensorTypes())                              \
-  DISABLE_MACRO(NUPHAR_OP(AveragePool, 7, DataTypeImpl::AllFixedSizeTensorTypes()))          \
+  NUPHAR_VERSIONED_OP(AveragePool, 7, 9, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes())  \
+  NUPHAR_OP(AveragePool, 10, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes())              \
  NUPHAR_OP(Ceil, 6, DataTypeImpl::AllIEEEFloatTensorTypes())                                \
  NUPHAR_OP(Clip, 6, DataTypeImpl::AllIEEEFloatTensorTypes())                                \
  NUPHAR_OP(Concat, 4, DataTypeImpl::AllFixedSizeTensorTypes())                              \
@ -94,8 +95,8 @@ class NupharKernelState {
  NUPHAR_OP(Floor, 6, DataTypeImpl::AllIEEEFloatTensorTypes())                               \
  NUPHAR_VERSIONED_OP(Gemm, 7, 8, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes())         \
  NUPHAR_OP(Gemm, 9, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes())                      \
-  DISABLE_MACRO(NUPHAR_OP(GlobalAveragePool, 1, DataTypeImpl::AllFixedSizeTensorTypes()))    \
-  DISABLE_MACRO(NUPHAR_OP(GlobalMaxPool, 1, DataTypeImpl::AllFixedSizeTensorTypes()))        \
+  NUPHAR_OP(GlobalAveragePool, 1, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes())         \
+  NUPHAR_OP(GlobalMaxPool, 1, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes())             \
  NUPHAR_OP(Greater, 9, DataTypeImpl::AllFixedSizeTensorTypes())                             \
  NUPHAR_OP(HardSigmoid, 6, DataTypeImpl::AllIEEEFloatTensorTypes())                         \
  NUPHAR_OP(Identity, 1, DataTypeImpl::AllFixedSizeTensorTypes())                            \
@ -107,8 +108,9 @@ class NupharKernelState {
  NUPHAR_VERSIONED_OP(MatMul, 1, 8, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes())       \
  NUPHAR_OP(MatMul, 9, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes())                    \
  NUPHAR_OP(Max, 8, DataTypeImpl::AllFixedSizeTensorTypes())                                 \
-  DISABLE_MACRO(NUPHAR_VERSIONED_OP(MaxPool, 1, 7, DataTypeImpl::AllFixedSizeTensorTypes())) \
-  DISABLE_MACRO(NUPHAR_OP(MaxPool, 8, DataTypeImpl::AllFixedSizeTensorTypes()))              \
+  NUPHAR_VERSIONED_OP(MaxPool, 1, 7, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes())      \
+  NUPHAR_VERSIONED_OP(MaxPool, 8, 9, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes())      \
+  NUPHAR_OP(MaxPool, 10, DataTypeImpl::AllIEEEFloatTensorExceptHalfTypes())                  \
  NUPHAR_OP(Min, 8, DataTypeImpl::AllFixedSizeTensorTypes())                                 \
  NUPHAR_OP(Mul, 7, DataTypeImpl::AllFixedSizeTensorTypes())                                 \
  NUPHAR_OP(Neg, 6, DataTypeImpl::AllFixedSizeTensorTypes())                                 \
--- a/onnxruntime/core/providers/nuphar/mti_x86/nn/pool_ops.cc
+++ b/onnxruntime/core/providers/nuphar/mti_x86/nn/pool_ops.cc
@ -0,0 +1,176 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/providers/nuphar/mti_x86/nn/pool_ops.h"
+
+#include "core/codegen/mti/mti_tvm_utils.h"
+#include "core/mlas/inc/mlas.h"
+#include "core/providers/cpu/nn/pool_attributes.h"
+#include <topi/detail/extern.h>
+
+namespace onnxruntime {
+namespace nuphar {
+
+TVM_REGISTER_GLOBAL("tvm.contrib.onnxruntime.pool_f32")  // packed function that unpacks its arguments and forwards them to MlasPool
+    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* /*ret*/) {
+      // input tensor, args[0]
+      DLTensor* X = args[0];
+      DCHECK(tvm::runtime::TypeMatch(X->dtype, kDLFloat, 32));
+      // output tensor, args[1]
+      DLTensor* Y = args[1];
+      DCHECK(tvm::runtime::TypeMatch(Y->dtype, kDLFloat, 32));
+
+      // args[2] carries the pooling kind as an int, because enum is not an integral type
+      int k = args[2];
+      MLAS_POOLING_KIND kind = static_cast<MLAS_POOLING_KIND>(k);
+
+      int num_args = args.size();
+      DCHECK(num_args > 3);
+      int arg_idx = 3;  // cursor into args; the lists below are read starting at args[3]
+
+      auto extract_values_fn = [&]() {  // reads one list: a count followed by that many int64 values, advancing arg_idx
+        std::vector<int64_t> vec;
+
+        DCHECK(arg_idx < num_args);
+        int64_t num_vec = args[arg_idx++];
+        for (int i = 0; i < num_vec; i++, arg_idx++) {
+          DCHECK(arg_idx < num_args);
+          int64_t v = args[arg_idx];
+          vec.push_back(v);
+        }
+        return vec;
+      };
+
+      std::vector<int64_t> kernel_shape = extract_values_fn();  // lists are consumed in the order kernel_shape, padding, strides
+      std::vector<int64_t> padding = extract_values_fn();
+      std::vector<int64_t> strides = extract_values_fn();
+
+      MlasPool(kind,
+               /*num_pooling_dims*/ kernel_shape.size(),
+               /*input_shape*/ X->shape,
+               kernel_shape.data(),
+               padding.data(),
+               strides.data(),
+               /*output_shape*/ Y->shape,
+               reinterpret_cast<float*>(static_cast<char*>(X->data) + X->byte_offset),  // data pointer adjusted by the tensor's byte_offset
+               reinterpret_cast<float*>(static_cast<char*>(Y->data) + Y->byte_offset),  // data pointer adjusted by the tensor's byte_offset
+               /*thread_pool*/ nullptr);
+    });
+
+TVM_REGISTER_GLOBAL("tvm.contrib.onnxruntime.global_pool_f32")  // packed function that forwards a global pooling request to MlasPool
+    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* /*ret*/) {
+      // input tensor, args[0]
+      DLTensor* X = args[0];
+      DCHECK(tvm::runtime::TypeMatch(X->dtype, kDLFloat, 32));
+      // output tensor, args[1]
+      DLTensor* Y = args[1];
+      DCHECK(tvm::runtime::TypeMatch(Y->dtype, kDLFloat, 32));
+
+      // args[2] carries the pooling kind as an int, because enum is not an integral type
+      int k = args[2];
+      MLAS_POOLING_KIND kind = static_cast<MLAS_POOLING_KIND>(k);
+
+      MlasPool(kind,
+               /*num_pooling_dims*/ X->ndim - 2,  // every dimension after the first two
+               /*input_shape*/ X->shape,
+               /*kernel_shape*/ nullptr,  // presumably MlasPool treats a null kernel_shape as global pooling -- confirm against mlas.h
+               /*padding*/ nullptr,
+               /*strides*/ nullptr,
+               /*output_shape*/ Y->shape,
+               reinterpret_cast<float*>(static_cast<char*>(X->data) + X->byte_offset),  // data pointer adjusted by the tensor's byte_offset
+               reinterpret_cast<float*>(static_cast<char*>(Y->data) + Y->byte_offset),  // data pointer adjusted by the tensor's byte_offset
+               /*thread_pool*/ nullptr);
+    });
+
+static tvm::Tensor MakeGlobalPoolCommon(const tvm::Tensor& X,  // builds an extern op that calls the global_pool_f32 packed function on X
+                                        const MLAS_POOLING_KIND kind,
+                                        const tvm::Array<tvm::Expr>& output_shape,
+                                        const std::string& name) {
+  return topi::detail::make_extern(
+           /*output_shapes*/ {output_shape},
+           /*output_types*/ {X->dtype},  // output dtype mirrors the input dtype
+           /*inputs*/ {X},
+           [&](tvm::Array<tvm::Buffer> ins, tvm::Array<tvm::Buffer> outs) {
+             return topi::detail::call_packed({tvm::Expr("tvm.contrib.onnxruntime.global_pool_f32"),
+                                               topi::detail::pack_buffer(ins[0]),
+                                               topi::detail::pack_buffer(outs[0]),
+                                               static_cast<int>(kind)});  // kind is sent as an int; the packed function casts it back
+           },
+           name, /*tag*/ "", /*attrs*/ {})[0];  // element 0 of the make_extern result is returned
+}
+
+static tvm::Tensor MakePoolCommon(const tvm::Tensor& X,  // builds an extern op that calls the pool_f32 packed function on X
+                                  const PoolAttributes& pool_attrs,  // kernel_shape, pads and strides are forwarded as call arguments
+                                  const MLAS_POOLING_KIND kind,
+                                  const tvm::Array<tvm::Expr>& output_shape,
+                                  const std::string& name) {
+  size_t num_input_dims = X.ndim();
+  ORT_ENFORCE(num_input_dims >= 3, "Input dimension must be >= 3");
+  size_t num_pooling_dims = num_input_dims - 2;  // the first two input dims are not pooled
+  ORT_ENFORCE(num_pooling_dims <= 3, "pooling size must be <= 3");
+  ORT_ENFORCE(num_pooling_dims == pool_attrs.kernel_shape.size(),
+              "kernel_shape num_dims is not compatible with X num_dims.");
+
+  tvm::Array<tvm::Expr> pooling_args;
+  auto add_args_fn = [&](const std::vector<int64_t>& v) {  // appends the list length followed by its elements, all as Int(64) constants
+    pooling_args.push_back(tvm::make_const(tvm::Int(64), static_cast<int64_t>(v.size())));
+    for (auto n : v) {
+      pooling_args.push_back(tvm::make_const(tvm::Int(64), n));
+    }
+  };
+  add_args_fn(pool_attrs.kernel_shape);  // append order is kernel_shape, pads, strides; pool_f32 reads the lists back in this order
+  add_args_fn(pool_attrs.pads);
+  add_args_fn(pool_attrs.strides);
+
+  return topi::detail::make_extern(
+           /*output_shapes*/ {output_shape},
+           /*output_types*/ {X->dtype},  // output dtype mirrors the input dtype
+           /*inputs*/ {X},
+           [&](tvm::Array<tvm::Buffer> ins, tvm::Array<tvm::Buffer> outs) {
+             tvm::Array<tvm::Expr> args = {tvm::Expr("tvm.contrib.onnxruntime.pool_f32"),
+                                           topi::detail::pack_buffer(ins[0]),
+                                           topi::detail::pack_buffer(outs[0]),
+                                           static_cast<int>(kind)};  // kind is sent as an int; the packed function casts it back
+             // kernel_shape, pads and strides are directly passed into the external function
+             for (size_t i = 0; i < pooling_args.size(); i++) {
+               args.push_back(pooling_args[i]);
+             }
+             return topi::detail::call_packed(args);
+           },
+           name, /*tag*/ "", /*attrs*/ {})[0];  // element 0 of the make_extern result is returned
+}
+
+tvm::Tensor AveragePool(const tvm::Tensor& X,  // average pooling built as an extern call via MakePoolCommon
+                        const PoolAttributes& pool_attrs,
+                        const tvm::Array<tvm::Expr>& output_shape,
+                        const std::string& name) {
+  MLAS_POOLING_KIND kind = pool_attrs.count_include_pad ? MlasAveragePoolingIncludePad  // count_include_pad selects the include-pad or exclude-pad kind
+                                                        : MlasAveragePoolingExcludePad;
+  return MakePoolCommon(X, pool_attrs, kind, output_shape, name);
+}
+
+tvm::Tensor GlobalAveragePool(const tvm::Tensor& X,  // global average pooling built as an extern call via MakeGlobalPoolCommon
+                              const PoolAttributes& pool_attrs,  // only count_include_pad is read
+                              const tvm::Array<tvm::Expr>& output_shape,
+                              const std::string& name) {
+  MLAS_POOLING_KIND kind = pool_attrs.count_include_pad ? MlasAveragePoolingIncludePad  // count_include_pad selects the include-pad or exclude-pad kind
+                                                        : MlasAveragePoolingExcludePad;
+  return MakeGlobalPoolCommon(X, kind, output_shape, name);
+}
+
+tvm::Tensor MaxPool(const tvm::Tensor& X,  // max pooling built as an extern call via MakePoolCommon
+                    const PoolAttributes& pool_attrs,
+                    const tvm::Array<tvm::Expr>& output_shape,
+                    const std::string& name) {
+  return MakePoolCommon(X, pool_attrs, MlasMaximumPooling, output_shape, name);  // the pooling kind is always MlasMaximumPooling
+}
+
+tvm::Tensor GlobalMaxPool(const tvm::Tensor& X,  // global max pooling built as an extern call via MakeGlobalPoolCommon
+                          const PoolAttributes& /*pool_attrs*/,  // part of the shared pool signature; not used here
+                          const tvm::Array<tvm::Expr>& output_shape,
+                          const std::string& name) {
+  return MakeGlobalPoolCommon(X, MlasMaximumPooling, output_shape, name);  // the pooling kind is always MlasMaximumPooling
+}
+
+}  // namespace nuphar
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/nuphar/mti_x86/nn/pool_ops.h
+++ b/onnxruntime/core/providers/nuphar/mti_x86/nn/pool_ops.h
@ -0,0 +1,37 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <string>
+#include <tvm/tvm.h>
+
+namespace onnxruntime {
+
+// Forward declaration
+struct PoolAttributes;
+
+namespace nuphar {
+
+tvm::Tensor AveragePool(const tvm::Tensor& X,                       // input handed to the MLAS-backed pool builder
+                        const PoolAttributes& pool_attrs,           // count_include_pad picks the MLAS averaging kind
+                        const tvm::Array<tvm::Expr>& output_shape,  // forwarded as-is; presumably the result shape - confirm
+                        const std::string& name = "average_pool");  // forwarded as the op name
+
+tvm::Tensor GlobalAveragePool(const tvm::Tensor& X,                              // input handed to the global pool builder
+                              const PoolAttributes& pool_attrs,                  // only count_include_pad is read
+                              const tvm::Array<tvm::Expr>& output_shape,         // forwarded as-is; presumably the result shape - confirm
+                              const std::string& name = "global_average_pool");  // forwarded as the op name
+
+tvm::Tensor MaxPool(const tvm::Tensor& X,                       // input handed to the MLAS-backed pool builder
+                    const PoolAttributes& pool_attrs,           // forwarded unchanged to the pool builder
+                    const tvm::Array<tvm::Expr>& output_shape,  // forwarded as-is; presumably the result shape - confirm
+                    const std::string& name = "max_pool");      // forwarded as the op name
+
+tvm::Tensor GlobalMaxPool(const tvm::Tensor& X,                          // input handed to the global pool builder
+                          const PoolAttributes& pool_attrs,              // unused by the definition; kept for a uniform signature
+                          const tvm::Array<tvm::Expr>& output_shape,     // forwarded as-is; presumably the result shape - confirm
+                          const std::string& name = "global_max_pool");  // forwarded as the op name
+
+}  // namespace nuphar
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/nuphar/nuphar_execution_provider.cc
+++ b/onnxruntime/core/providers/nuphar/nuphar_execution_provider.cc
@ -193,6 +193,23 @@ NupharExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_vie
    if (node.OpType() == "Tile" && !graph_viewer.IsConstantInitializer(inputs[1]->Name(), true))
      return false;  // do not support tile that has dynamic repeats

+    if (node.OpType() == "MaxPool") {  // MaxPool is rejected below in the configurations not handled yet
+      // TODO: enable support for Indices (more than one output def => reject the node for now)
+      if (node.OutputDefs().size() > 1) {
+        return false;
+      }
+      // TODO: enable support for non-default dilations (any "dilations" entry > 1 => reject the node for now)
+      const onnxruntime::NodeAttributes& attrs = node.GetAttributes();
+      auto it = attrs.find("dilations");
+      if (it != attrs.end()) {  // attribute absent: nothing to check
+        for (int i = 0; i < it->second.ints_size(); i++) {
+          if (it->second.ints(i) > 1) {  // NOTE(review): values below 1 pass this check - confirm they are rejected elsewhere
+            return false;
+          }
+        }
+      }
+    }
+
    if (node.OpType() == "Slice") {
      auto num_inputs = inputs.size();
      ORT_ENFORCE(num_inputs > 0);