[NupharEP] Enable parallel schedule (#2505)

* [NupharEP] Enable parallel schedule * Update TVM with the fix to TVM threadpool to use OpenMP if possible * Add parallel schedule when trying to vectorize With this change, BERT squad perf on a 4-core (8 HT) CPU goes from 187ms to 150ms * Address CR, docs and cmake update * Doc fix * Fix mkl * Fix TVM windows build when using mklml
2026-07-20 19:12:24 +00:00 · 2019-11-28 08:35:56 -08:00 · 2019-11-28 08:35:56 -08:00 · 60208463a9
commit 60208463a9
parent 005305be6e
16 changed files with 170 additions and 32 deletions
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@ -343,6 +343,11 @@ if (onnxruntime_USE_ACL)
  list(APPEND onnxruntime_EXTERNAL_LIBRARIES arm_compute acl arm_compute_graph arm_compute_core)
 endif()

+# MKLML
+if (onnxruntime_USE_MKLDNN OR onnxruntime_USE_MKLML)
+  include(mkldnn)
+endif()
+
 # TVM
 if (onnxruntime_USE_TVM)
  if (onnxruntime_USE_CUDA)
@ -352,6 +357,19 @@ if (onnxruntime_USE_TVM)
    set(USE_LLVM ON)
    add_definitions(-DUSE_TVM_WITH_LLVM)
  endif()
+  if (onnxruntime_USE_OPENMP)
+    set(USE_OPENMP "gnu")
+  endif()
+  if (onnxruntime_USE_MKLML)
+    set(USE_OPENMP "intel")
+    # make sure MKLML in ORT is used by TVM
+    if (WIN32)
+      set(OMP_LIBRARY ${MKLML_LIB_DIR}/${IOMP5MD_IMPORT_LIB})
+    else()
+      set(OMP_LIBRARY ${MKLML_LIB_DIR}/${IOMP5MD_SHARED_LIB})
+    endif()
+  endif()
+
  add_subdirectory(${PROJECT_SOURCE_DIR}/external/tvm EXCLUDE_FROM_ALL)
  set_target_properties(tvm PROPERTIES FOLDER "External/tvm")
  set_target_properties(tvm_topi PROPERTIES FOLDER "External/tvm")
@ -501,10 +519,6 @@ include_directories(
  ${REPO_ROOT}/include/onnxruntime/core/session
 )

-if (onnxruntime_USE_MKLDNN OR onnxruntime_USE_MKLML)
-  include(mkldnn)
-endif()
-
 if(onnxruntime_USE_GEMMLOWP)
  add_definitions(-DUSE_GEMMLOWP=1)
 endif()
--- a/cmake/external/tvm
+++ b/cmake/external/tvm
@ -1 +1 @@
-Subproject commit 9b3a424a91d6003db1993cdd7121e46696f220e8
+Subproject commit c6e3efcdb09aeda961a6badf76093ceac69db64d
--- a/docs/execution_providers/Nuphar-ExecutionProvider.md
+++ b/docs/execution_providers/Nuphar-ExecutionProvider.md
@ -17,6 +17,12 @@ You can use the Nuphar execution provider via the python wheel from the ONNX Run
 ## Performance and Accuracy Testing
 You can test your ONNX model's performance with [onnxruntime_perf_test](../../onnxruntime/test/perftest/README.md), or test accuracy with [onnx_test_runner](../../onnxruntime/test/onnx/README.txt). To run these tools with the Nuphar execution provider, please pass `-e nuphar` in command line options.

+Please note that Nuphar uses the TVM thread pool and parallel schedule for multi-threaded inference performance. When building with OpenMP or MKLML, the TVM thread pool uses gomp or iomp as its implementation; otherwise, TVM creates its own thread pool. Because of this, the current default parallel schedule policy is:
+- Default to on for USE_OPENMP or USE_MKLML. Users can use OMP_NUM_THREADS/MKL_NUM_THREADS, as well as TVM_NUM_THREADS, to control the TVM thread pool.
+- Default to off when neither of the above is used. Users can use TVM_NUM_THREADS to control the TVM thread pool.
+
+This choice ensures ideal performance with the different build options. When building with USE_OPENMP or USE_MKLML, users already have to avoid thread conflicts between OpenMP or MKL and their own inference invocations, so parallel schedule is enabled to leverage the existing thread pool. When not building with gomp or iomp, parallel schedule is turned off by default so that the TVM thread pool does not conflict with user threads. If needed, users can set [NUPHAR_PARALLEL_MIN_WORKLOADS](../../onnxruntime/core/providers/nuphar/common/nuphar_settings.cc#L61), via environment variable or settings, to 0 to disable parallel schedule, or to a non-zero value to enable it. The non-zero value is the minimum number of elements that must be computed per thread for parallel schedule to be turned on.
+
 ## Model Conversion and Quantization
 You may use Python script [model_editor.py](../../onnxruntime/core/providers/nuphar/scripts/model_editor.py) to turn LSTM/GRU/RNN ops to Scan ops for a given model, and then use [model_quantizer.py](../../onnxruntime/core/providers/nuphar/scripts/model_quantizer.py) to quantize MatMul ops into MatMulInteger ops.

--- a/onnxruntime/core/codegen/passes/scheduler/schedule_utils.cc
+++ b/onnxruntime/core/codegen/passes/scheduler/schedule_utils.cc
@ -58,6 +58,19 @@ bool InsertRootScheduleAndClosure(
  return true;
 }

+// Check precondition for vectorize schedule
+bool ShouldTryVectorization(
+    const tvm::Tensor& tensor,
+    ScheduleContext& ctx) {
+  auto it = ctx.scheduled_tensors.find(tensor->op.get());
+  if (it != ctx.scheduled_tensors.end()) {
+    if (it->second > ScheduleType::ScheduleInline) {
+      return false;
+    }
+  }
+  return true;
+}
+
 // Check the schedule of tensor
 // If it is not scheduled, try to vectorize it.
 // Note TryVectorization has to use with compute_root.
@ -66,12 +79,8 @@ bool TryVectorization(
    const tvm::Tensor& tensor,
    int64_t natural_vector_size,
    ScheduleContext& ctx) {
-  auto it = ctx.scheduled_tensors.find(tensor->op.get());
-  if (it != ctx.scheduled_tensors.end()) {
-    if (it->second > ScheduleType::ScheduleInline) {
-      return false;
-    }
-  }
+  if (!ShouldTryVectorization(tensor, ctx))
+    return false;

  auto shape = tensor->shape;
  auto rank = shape.size();
--- a/onnxruntime/core/codegen/passes/scheduler/schedule_utils.h
+++ b/onnxruntime/core/codegen/passes/scheduler/schedule_utils.h
@ -26,6 +26,11 @@ bool InsertRootScheduleAndClosure(
    const tvm::Tensor& tensor,
    ScheduleContext& ctx);

+// Check precondition for vectorize schedule
+bool ShouldTryVectorization(
+    const tvm::Tensor& tensor,
+    ScheduleContext& ctx);
+
 // Check the schedule of tensor
 // If it is not scheduled, try to vectorize it.
 // Note TryVectorization has to use with compute_root.
--- a/onnxruntime/core/providers/nuphar/common/nuphar_settings.cc
+++ b/onnxruntime/core/providers/nuphar/common/nuphar_settings.cc
@ -38,7 +38,8 @@ static const std::unordered_set<std::string> valid_keys = {
    kNupharCacheSoName,
    kNupharCacheModelChecksum,
    kNupharCacheForceNoJIT,
-    kNupharCodeGenTarget};
+    kNupharCodeGenTarget,
+    kNupharParallelMinWorkloads};

 void SetDefaultOptions(std::map<std::string, std::string>& options) {
  // create two temporary strings to get rid of the odr-use issue introduced
@ -56,6 +57,20 @@ void SetDefaultOptions(std::map<std::string, std::string>& options) {
  std::string cache_so_name_opt(kNupharCacheSoName);
  std::string cache_so_name_default(kNupharCacheSoName_Default);
  options.insert(std::make_pair(cache_so_name_opt, cache_so_name_default));
+
+  std::string parallel_min_workloads_opt(kNupharParallelMinWorkloads);
+#if defined(USE_OPENMP) || defined(USE_MKLML)
+  // a rough estimate of workloads based on static dimensions for each thread, when using parallel schedule
+  // user may change it to 0 to turn it off,
+  // or use OMP_NUM_THREADS to control TVM thread pool similar to control MKL
+  unsigned int parallel_min_workloads_default = 64;
+#else
+  // turn off parallel schedule by default to avoid TVM thread pool confliction with others
+  // this is to ensure performance when user runs multiple inference threads, with each runs as single thread
+  // if needed, user can override it with settings, and use TVM_NUM_THREADS to control the thread pool
+  unsigned int parallel_min_workloads_default = 0;
+#endif
+  options.insert(std::make_pair(parallel_min_workloads_opt, std::to_string(parallel_min_workloads_default)));
 }

 void CreateNupharCodeGenSettings(const NupharExecutionProviderInfo& info) {
--- a/onnxruntime/core/providers/nuphar/common/nuphar_settings.h
+++ b/onnxruntime/core/providers/nuphar/common/nuphar_settings.h
@ -45,6 +45,9 @@ constexpr static const char* kNupharActivations_DeepCpu = "deep_cpu_activation";
 // Option to control nuphar code generation target (avx / avx2 / avx512)
 constexpr static const char* kNupharCodeGenTarget = "nuphar_codegen_target";

+// Option to control nuphar code to run with parallel schedule
+constexpr static const char* kNupharParallelMinWorkloads = "nuphar_parallel_min_workloads";
+
 // cache version number (MAJOR.MINOR.PATCH) following https://semver.org/
 // 1. MAJOR version when you make incompatible changes that old cache files no longer work,
 // 2. MINOR version when you add functionality in a backwards - compatible manner, and
--- a/onnxruntime/core/providers/nuphar/compiler/nuphar_handle.h
+++ b/onnxruntime/core/providers/nuphar/compiler/nuphar_handle.h
@ -28,7 +28,7 @@ struct NupharCodeGenHandle : codegen::CodeGenHandle {
  std::shared_ptr<tvm_codegen::TVMScheduleBuilder> schedule_builder;  // keep
  // maybe add a layout
  tvm_codegen::WeightLayoutRegistry* layout_registry;
-  bool enable_per_node_parallelized;  // TODO: change to config
+  int64_t parallel_min_workloads;

  bool allow_unaligned_buffers;  // move to another place

--- a/onnxruntime/core/providers/nuphar/compiler/nuphar_schedule_builder.cc
+++ b/onnxruntime/core/providers/nuphar/compiler/nuphar_schedule_builder.cc
@ -9,6 +9,7 @@

 #include "core/providers/nuphar/common/analysis/subgraph_codegen_stats.h"
 #include "core/providers/nuphar/compiler/x86/x86_target_info.h"
+#include "core/providers/nuphar/compiler/x86/scheduler/nuphar_scheduler.h"

 // TODO change name space
 namespace onnxruntime {
@ -37,9 +38,8 @@ static void Traverse(const tvm::Tensor& tensor,
  if (is_real_output) {
    CodeGenTargetX86* target = dynamic_cast<CodeGenTargetX86*>(ctx_codegen.GetCodeGenHandle()->codegen_target);
    ORT_ENFORCE(target != nullptr);
-    int64_t natural_vector_size = target->NaturalVectorWidth(tensor->dtype.bits());

-    TryVectorization(tensor, natural_vector_size, ctx_schedule);  // to x86
+    TryVectorizationX86(tensor, ctx_codegen, ctx_schedule);
    InsertRootScheduleAndClosure(tensor, ctx_schedule);
  }

--- a/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/nuphar_scheduler.h
+++ b/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/nuphar_scheduler.h
@ -39,5 +39,13 @@ bool InputRootScheduleWithVectorizationX86(
    tvm_codegen::CodeGenContext& ctx_codegen,
    tvm_codegen::ScheduleContext& ctx_sched);

+bool TryParallelX86(
+    const tvm::Tensor& tensor,
+    int64_t to_dim,  // fuse dims before to_dim for parallel schedule, 0 to fuse all but last dim
+    tvm_codegen::CodeGenContext& ctx_codegen,
+    tvm_codegen::ScheduleContext& ctx_sched);
+
+constexpr auto kNupharScheduleNoParallel = "nuphar_schedule_no_parallel";
+
 }  // namespace nuphar
 }  // namespace onnxruntime
--- a/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/ort_type_schedule.cc
+++ b/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/ort_type_schedule.cc
@ -3,26 +3,99 @@

 #include "core/providers/nuphar/compiler/x86/scheduler/nuphar_scheduler.h"

+#include "core/codegen/passes/scheduler/schedule_utils.h"
+#include "core/framework/op_kernel_info.h"
+#include "core/providers/nuphar/common/nuphar_settings.h"
 #include "core/providers/nuphar/common/analysis/subgraph_codegen_stats.h"
 #include "core/providers/nuphar/compiler/nuphar_codegen_ctx.h"
-#include "core/codegen/passes/scheduler/schedule_utils.h"
 #include "core/providers/nuphar/compiler/x86/scheduler/tensorize/intrin_gemv_ll_extern.h"
 #include "core/providers/nuphar/compiler/x86/scheduler/tensorize/intrin_gemv_ll_ir.h"
 #include "core/providers/nuphar/compiler/x86/x86_target_info.h"
-#include "core/framework/op_kernel_info.h"
 #include <tvm/tvm.h>
+#include <tvm/ir_pass.h>

 namespace onnxruntime {
 namespace nuphar {

+bool TryParallelX86(
+    const tvm::Tensor& tensor,
+    int64_t to_dim,
+    tvm_codegen::CodeGenContext& ctx_codegen,
+    tvm_codegen::ScheduleContext& ctx_sched) {
+  auto compute_op = tensor->op.as<tvm::ComputeOpNode>();
+  if (compute_op == nullptr) {
+    return false;
+  }
+  if (compute_op->attrs.count(kNupharScheduleNoParallel)) {
+    return false;
+  }
+
+  const auto& shape = tensor->shape;
+
+  int rank = gsl::narrow<int>(shape.size());
+  tvm::Array<tvm::IterVar> to_fuse_for_parallel;
+  int64_t rank_to_parallel = (to_dim ? to_dim : rank - 1);
+  for (int64_t i = 0; i < rank_to_parallel && i < gsl::narrow<int64_t>(compute_op->axis.size()); ++i) {
+    tvm::IterVar axis = compute_op->axis[i];
+    auto dom = axis->dom;
+    if (!tvm::ir::Equal(dom->extent, shape[i])) {
+      // only do parallel schedule on axis not being fused or split yet
+      rank_to_parallel = i;
+      break;
+    }
+    to_fuse_for_parallel.push_back(axis);
+  }
+
+  if (to_fuse_for_parallel.size() < 1) {
+    return false;
+  }
+
+  int64_t per_thread_static_dims = 1;
+  for (const auto& reduce_axis : compute_op->reduce_axis) {
+    const int64_t* static_range = tvm::as_const_int(reduce_axis->dom->extent);
+    if (static_range != nullptr) {
+      per_thread_static_dims *= *static_range;
+    }
+  }
+  for (int64_t i = rank_to_parallel; i < rank; ++i) {
+    auto dim = tvm::as_const_int(shape[i]);
+    if (dim != nullptr) {
+      per_thread_static_dims *= *dim;
+    }
+  }
+
+  // skip small per thread workloads, note that symbolic dims are ignored (treated as 1)
+  int64_t workloads_threshold = Promote<NupharCodeGenCtx>(&ctx_codegen)->GetCodeGenHandle()->parallel_min_workloads;
+  if (workloads_threshold <= 0 || per_thread_static_dims < workloads_threshold) {
+    return false;
+  }
+
+  tvm::IterVar parallel_axis;
+  if (to_fuse_for_parallel.size() > 1) {
+    ctx_sched.schedule[tensor->op].fuse(to_fuse_for_parallel, &parallel_axis);
+  } else {
+    parallel_axis = to_fuse_for_parallel[0];
+  }
+  ctx_sched.schedule[tensor->op].parallel(parallel_axis);
+  return true;
+}
+
 bool TryVectorizationX86(
    const tvm::Tensor& tensor,
    tvm_codegen::CodeGenContext& ctx_codegen,
    tvm_codegen::ScheduleContext& ctx_sched) {
+  if (!ShouldTryVectorization(tensor, ctx_sched))
+    return false;
+
  CodeGenTargetX86* target = dynamic_cast<CodeGenTargetX86*>(ctx_codegen.GetCodeGenHandle()->codegen_target);
  ORT_ENFORCE(target != nullptr);
  int64_t natural_vector_size = target->NaturalVectorWidth(tensor->dtype.bits());

+  // try to use parallel schedule when vectorizing
+  // note that we don't do logic-or in return value here
+  // to make sure vectorization is always tried
+  TryParallelX86(tensor, 0, ctx_codegen, ctx_sched);
+
  return TryVectorization(tensor, natural_vector_size, ctx_sched);
 }

@ -176,7 +249,7 @@ static Status ConvScheduleX86(const tvm::Tensor& tensor,

  ctx_sched.schedule[tensor->op].reorder({b, oc_chunk, y, xo, ic_chunk, m, n, ic_block, xi, oc_block});

-  if (ctx_codegen.GetCodeGenHandle()->enable_per_node_parallelized) {
+  if (ctx_codegen.GetCodeGenHandle()->parallel_min_workloads > 0) {
    tvm::Array<tvm::IterVar> fused_axis;
    fused_axis.push_back(b);
    fused_axis.push_back(oc_chunk);
@ -186,6 +259,7 @@ static Status ConvScheduleX86(const tvm::Tensor& tensor,
    ctx_sched.schedule[tensor->op].fuse(fused_axis, &parallel_axis);
    ctx_sched.schedule[tensor->op].parallel(parallel_axis);
  }
+
  ctx_sched.schedule[tensor->op].vectorize(oc_block);

  return Status::OK();
@ -243,7 +317,7 @@ static Status MatMul_2DWeight_Schedule(
  ctx_sched.schedule[CC->op].unroll(ki);
  ctx_sched.schedule[CC->op].vectorize(yc);

-  if (ctx_codegen.GetCodeGenHandle()->enable_per_node_parallelized) {
+  if (ctx_codegen.GetCodeGenHandle()->parallel_min_workloads > 0) {
    // parallelize
    tvm::Array<tvm::IterVar> fused_axis;
    for (size_t d = 0; d < C_rank - 2; ++d)
--- a/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/tvm_rule_schedule.cc
+++ b/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/tvm_rule_schedule.cc
@ -22,6 +22,7 @@ bool TVM_SCHEDULER_CLASS(Extern, NupharX86TVMRule)::Evaluate(

 static bool ReduceVScheduleNupharX86(
    const tvm::Tensor& tensor,
+    tvm_codegen::CodeGenContext& ctx_codegen,
    tvm_codegen::ScheduleContext& ctx_sched) {
  InsertRootScheduleAndClosure(tensor, ctx_sched);

@ -55,6 +56,8 @@ static bool ReduceVScheduleNupharX86(
    if (shape.size() > 0)
      head_dim = as_const_int(shape[0]);

+    bool try_parallel = true;
+
    // unroll packed reduce by checking head dim
    if (nullptr != head_dim) {
      // if head_dim is already fused, don't unroll
@ -81,8 +84,13 @@ static bool ReduceVScheduleNupharX86(

        ctx_sched.schedule[tensor->op].reorder(reorder_axis);
        ctx_sched.schedule[tensor->op].unroll(x0);
+        try_parallel = false;
      }
    }
+
+    if (try_parallel) {
+      TryParallelX86(tensor, *fuse_dim, ctx_codegen, ctx_sched);
+    }
  } else if (compute_op->axis.size() > 0 &&
             tvm::as_const_int(tensor->shape[0]) != nullptr) {
    tvm::IterVar x = compute_op->axis[0];
@ -101,7 +109,7 @@ static bool ReduceVScheduleNupharX86(
 bool TVM_SCHEDULER_CLASS(Reduce, NupharX86TVMRule)::Evaluate(
    const tvm::Tensor& tensor,
    const Node*,
-    tvm_codegen::CodeGenContext&,
+    tvm_codegen::CodeGenContext& ctx_codegen,
    tvm_codegen::ScheduleContext& ctx_sched) {
  // respect topi::kCommReduce
  if (tensor->op->tag == topi::kCommReduce) {
@ -109,7 +117,7 @@ bool TVM_SCHEDULER_CLASS(Reduce, NupharX86TVMRule)::Evaluate(
  }

  if (tensor->op->tag == nuphar::kNupharVReduce) {
-    return ReduceVScheduleNupharX86(tensor, ctx_sched);
+    return ReduceVScheduleNupharX86(tensor, ctx_codegen, ctx_sched);
  }

  // unknown goes to InsertRootScheduleAndClosure
--- a/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc
+++ b/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc
@ -7,6 +7,7 @@
 #include "core/codegen/mti/mti_tvm_utils.h"
 #include "core/codegen/mti/tensor/pad_ops.h"
 #include "core/codegen/mti/tensor/reshape_ops.h"
+#include "core/providers/nuphar/compiler/x86/scheduler/nuphar_scheduler.h"
 #include <topi/reduction.h>

 namespace onnxruntime {
@ -85,7 +86,7 @@ tvm::Tensor ReduceValueWithoutSplit(const tvm::Tensor& X,

  tvm::Map<std::string, tvm::NodeRef> attrs;
  attrs.Set(kNupharVReduceFuseDim, tvm::Expr(fuse_dim));
-
+  attrs.Set(kNupharScheduleNoParallel, tvm::Expr(true));
  return tvm::compute(output_shape, l_out, name + "_regular_reduce", kNupharVReduce, attrs);
 }

--- a/onnxruntime/core/providers/nuphar/nuphar_execution_provider.cc
+++ b/onnxruntime/core/providers/nuphar/nuphar_execution_provider.cc
@ -125,8 +125,8 @@ NupharExecutionProvider::NupharExecutionProvider(const NupharExecutionProviderIn

  handle->shape_inference = whole_graph_shape_infer_;

-  // TODO: remove
-  handle->enable_per_node_parallelized = info.enable_per_node_parallel;
+  handle->parallel_min_workloads = std::stoi(settings.GetOptionValue(kNupharParallelMinWorkloads));
+
  // TODO: remove
  handle->allow_unaligned_buffers = info.allow_unaligned_buffers;  // TODO remove this

--- a/onnxruntime/core/providers/nuphar/nuphar_execution_provider.h
+++ b/onnxruntime/core/providers/nuphar/nuphar_execution_provider.h
@ -31,9 +31,6 @@ constexpr const char* default_nuphar_target_str = stackvm_target_str;

 // Information needed to construct Nuphar execution providers.
 struct NupharExecutionProviderInfo {
-  // By default, let provider decide the target by passing in empty string.
-  bool enable_per_node_parallel;  // TODO: remove
-
  // this flag set TVM build_config with data_alignment=1, at the cost of performance
  bool allow_unaligned_buffers;

@ -43,10 +40,8 @@ struct NupharExecutionProviderInfo {
  std::string settings;

  explicit NupharExecutionProviderInfo(bool unaligned_buffers,
-                                       const std::string& str_settings = "",
-                                       bool per_node_parallel = true)
-      : enable_per_node_parallel(per_node_parallel),
-        allow_unaligned_buffers(unaligned_buffers),
+                                       const std::string& str_settings = "")
+      : allow_unaligned_buffers(unaligned_buffers),
        settings(str_settings) {}
  NupharExecutionProviderInfo() = default;
 };
--- a/onnxruntime/core/providers/nuphar/nuphar_provider_factory.cc
+++ b/onnxruntime/core/providers/nuphar/nuphar_provider_factory.cc
@ -21,7 +21,7 @@ struct NupharExecutionProviderFactory : IExecutionProviderFactory {
 };

 std::unique_ptr<IExecutionProvider> NupharExecutionProviderFactory::CreateProvider() {
-  NupharExecutionProviderInfo info(allow_unaligned_buffers_, settings_, /*per_node_parallel*/ true);
+  NupharExecutionProviderInfo info(allow_unaligned_buffers_, settings_);
  return onnxruntime::make_unique<NupharExecutionProvider>(info);
 }