[NupharEP] Enable parallel schedule (#2505)

* [NupharEP] Enable parallel schedule
* Update TVM with the fix to TVM threadpool to use OpenMP if possible
* Add parallel schedule when trying to vectorize
With this change, BERT squad perf on a 4-core (8 HT) CPU goes from 187ms to 150ms

* Address CR, docs and cmake update

* Doc fix

* Fix mkl

* Fix TVM windows build when using mklml
This commit is contained in:
KeDengMS 2019-11-28 08:35:56 -08:00 committed by GitHub
parent 005305be6e
commit 60208463a9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
16 changed files with 170 additions and 32 deletions

View file

@ -343,6 +343,11 @@ if (onnxruntime_USE_ACL)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES arm_compute acl arm_compute_graph arm_compute_core)
endif()
# MKLML
if (onnxruntime_USE_MKLDNN OR onnxruntime_USE_MKLML)
include(mkldnn)
endif()
# TVM
if (onnxruntime_USE_TVM)
if (onnxruntime_USE_CUDA)
@ -352,6 +357,19 @@ if (onnxruntime_USE_TVM)
set(USE_LLVM ON)
add_definitions(-DUSE_TVM_WITH_LLVM)
endif()
if (onnxruntime_USE_OPENMP)
set(USE_OPENMP "gnu")
endif()
if (onnxruntime_USE_MKLML)
set(USE_OPENMP "intel")
# make sure MKLML in ORT is used by TVM
if (WIN32)
set(OMP_LIBRARY ${MKLML_LIB_DIR}/${IOMP5MD_IMPORT_LIB})
else()
set(OMP_LIBRARY ${MKLML_LIB_DIR}/${IOMP5MD_SHARED_LIB})
endif()
endif()
add_subdirectory(${PROJECT_SOURCE_DIR}/external/tvm EXCLUDE_FROM_ALL)
set_target_properties(tvm PROPERTIES FOLDER "External/tvm")
set_target_properties(tvm_topi PROPERTIES FOLDER "External/tvm")
@ -501,10 +519,6 @@ include_directories(
${REPO_ROOT}/include/onnxruntime/core/session
)
if (onnxruntime_USE_MKLDNN OR onnxruntime_USE_MKLML)
include(mkldnn)
endif()
if(onnxruntime_USE_GEMMLOWP)
add_definitions(-DUSE_GEMMLOWP=1)
endif()

2
cmake/external/tvm vendored

@ -1 +1 @@
Subproject commit 9b3a424a91d6003db1993cdd7121e46696f220e8
Subproject commit c6e3efcdb09aeda961a6badf76093ceac69db64d

View file

@ -17,6 +17,12 @@ You can use the Nuphar execution provider via the python wheel from the ONNX Run
## Performance and Accuracy Testing
You can test your ONNX model's performance with [onnxruntime_perf_test](../../onnxruntime/test/perftest/README.md), or test accuracy with [onnx_test_runner](../../onnxruntime/test/onnx/README.txt). To run these tools with the Nuphar execution provider, please pass `-e nuphar` in command line options.
Please note that Nuphar uses TVM thread pool and parallel schedule for multi-thread inference performance. When building with OpenMP or MKLML, TVM thread pool would use gomp or iomp as its implementation; otherwise, TVM creates its own thread pool. Because of this, the current default parallel schedule policy is:
- Default to on when building with USE_OPENMP or USE_MKLML. Users can use OMP_NUM_THREADS/MKL_NUM_THREADS, as well as TVM_NUM_THREADS, to control the TVM thread pool.
- Default to off when building with neither of the above. Users can use TVM_NUM_THREADS to control the TVM thread pool.
This choice is made to get ideal performance with the different build options. When building with USE_OPENMP or USE_MKLML, users already have to avoid thread conflicts between OpenMP or MKL and their inference invocations, so parallel schedule is enabled to leverage the existing thread pool. When not building with gomp or iomp, the TVM thread pool is turned off to avoid conflicts with user threads. If needed, users can set the environment variable or setting [NUPHAR_PARALLEL_MIN_WORKLOADS](../../onnxruntime/core/providers/nuphar/common/nuphar_settings.cc#L61) to 0 to disable parallel schedule, or to some non-zero value to enable parallel schedule. The non-zero value indicates the minimal number of elements computed per thread for parallel schedule to be turned on.
## Model Conversion and Quantization
You may use Python script [model_editor.py](../../onnxruntime/core/providers/nuphar/scripts/model_editor.py) to turn LSTM/GRU/RNN ops to Scan ops for a given model, and then use [model_quantizer.py](../../onnxruntime/core/providers/nuphar/scripts/model_quantizer.py) to quantize MatMul ops into MatMulInteger ops.

View file

@ -58,6 +58,19 @@ bool InsertRootScheduleAndClosure(
return true;
}
// Precondition check shared by the vectorize schedules.
// A tensor is eligible unless its op is already recorded in
// ctx.scheduled_tensors with a schedule type beyond ScheduleInline.
bool ShouldTryVectorization(
    const tvm::Tensor& tensor,
    ScheduleContext& ctx) {
  const auto found = ctx.scheduled_tensors.find(tensor->op.get());
  const bool already_scheduled =
      found != ctx.scheduled_tensors.end() &&
      found->second > ScheduleType::ScheduleInline;
  return !already_scheduled;
}
// Check the schedule of tensor
// If it is not scheduled, try to vectorize it.
// Note TryVectorization has to use with compute_root.
@ -66,12 +79,8 @@ bool TryVectorization(
const tvm::Tensor& tensor,
int64_t natural_vector_size,
ScheduleContext& ctx) {
auto it = ctx.scheduled_tensors.find(tensor->op.get());
if (it != ctx.scheduled_tensors.end()) {
if (it->second > ScheduleType::ScheduleInline) {
return false;
}
}
if (!ShouldTryVectorization(tensor, ctx))
return false;
auto shape = tensor->shape;
auto rank = shape.size();

View file

@ -26,6 +26,11 @@ bool InsertRootScheduleAndClosure(
const tvm::Tensor& tensor,
ScheduleContext& ctx);
// Check precondition for vectorize schedule
// Returns false when tensor's op is already recorded in ctx.scheduled_tensors
// with a schedule type greater than ScheduleType::ScheduleInline;
// returns true otherwise (including when the op has no record yet).
bool ShouldTryVectorization(
const tvm::Tensor& tensor,
ScheduleContext& ctx);
// Check the schedule of tensor
// If it is not scheduled, try to vectorize it.
// Note TryVectorization has to use with compute_root.

View file

@ -38,7 +38,8 @@ static const std::unordered_set<std::string> valid_keys = {
kNupharCacheSoName,
kNupharCacheModelChecksum,
kNupharCacheForceNoJIT,
kNupharCodeGenTarget};
kNupharCodeGenTarget,
kNupharParallelMinWorkloads};
void SetDefaultOptions(std::map<std::string, std::string>& options) {
// create two temporary strings to get rid of the odr-use issue introduced
@ -56,6 +57,20 @@ void SetDefaultOptions(std::map<std::string, std::string>& options) {
std::string cache_so_name_opt(kNupharCacheSoName);
std::string cache_so_name_default(kNupharCacheSoName_Default);
options.insert(std::make_pair(cache_so_name_opt, cache_so_name_default));
std::string parallel_min_workloads_opt(kNupharParallelMinWorkloads);
#if defined(USE_OPENMP) || defined(USE_MKLML)
// a rough estimate of workloads based on static dimensions for each thread, when using parallel schedule
// user may change it to 0 to turn it off,
// or use OMP_NUM_THREADS to control TVM thread pool similar to control MKL
unsigned int parallel_min_workloads_default = 64;
#else
// turn off parallel schedule by default to avoid TVM thread pool confliction with others
// this is to ensure performance when user runs multiple inference threads, with each runs as single thread
// if needed, user can override it with settings, and use TVM_NUM_THREADS to control the thread pool
unsigned int parallel_min_workloads_default = 0;
#endif
options.insert(std::make_pair(parallel_min_workloads_opt, std::to_string(parallel_min_workloads_default)));
}
void CreateNupharCodeGenSettings(const NupharExecutionProviderInfo& info) {

View file

@ -45,6 +45,9 @@ constexpr static const char* kNupharActivations_DeepCpu = "deep_cpu_activation";
// Option to control nuphar code generation target (avx / avx2 / avx512)
constexpr static const char* kNupharCodeGenTarget = "nuphar_codegen_target";
// Option to control nuphar code to run with parallel schedule
constexpr static const char* kNupharParallelMinWorkloads = "nuphar_parallel_min_workloads";
// cache version number (MAJOR.MINOR.PATCH) following https://semver.org/
// 1. MAJOR version when you make incompatible changes that old cache files no longer work,
// 2. MINOR version when you add functionality in a backwards - compatible manner, and

View file

@ -28,7 +28,7 @@ struct NupharCodeGenHandle : codegen::CodeGenHandle {
std::shared_ptr<tvm_codegen::TVMScheduleBuilder> schedule_builder; // keep
// maybe add a layout
tvm_codegen::WeightLayoutRegistry* layout_registry;
bool enable_per_node_parallelized; // TODO: change to config
int64_t parallel_min_workloads;
bool allow_unaligned_buffers; // move to another place

View file

@ -9,6 +9,7 @@
#include "core/providers/nuphar/common/analysis/subgraph_codegen_stats.h"
#include "core/providers/nuphar/compiler/x86/x86_target_info.h"
#include "core/providers/nuphar/compiler/x86/scheduler/nuphar_scheduler.h"
// TODO change name space
namespace onnxruntime {
@ -37,9 +38,8 @@ static void Traverse(const tvm::Tensor& tensor,
if (is_real_output) {
CodeGenTargetX86* target = dynamic_cast<CodeGenTargetX86*>(ctx_codegen.GetCodeGenHandle()->codegen_target);
ORT_ENFORCE(target != nullptr);
int64_t natural_vector_size = target->NaturalVectorWidth(tensor->dtype.bits());
TryVectorization(tensor, natural_vector_size, ctx_schedule); // to x86
TryVectorizationX86(tensor, ctx_codegen, ctx_schedule);
InsertRootScheduleAndClosure(tensor, ctx_schedule);
}

View file

@ -39,5 +39,13 @@ bool InputRootScheduleWithVectorizationX86(
tvm_codegen::CodeGenContext& ctx_codegen,
tvm_codegen::ScheduleContext& ctx_sched);
// Try to apply a parallel schedule on the leading axes of tensor.
// Leading axes are fused (when more than one) and the result is marked parallel.
// Returns true if a parallel schedule was applied. Returns false when:
//   - tensor's op is not a tvm::ComputeOpNode,
//   - the op carries the kNupharScheduleNoParallel attribute,
//   - no leading axis is eligible (an axis whose extent no longer equals the
//     tensor shape is treated as already fused or split), or
//   - the handle's parallel_min_workloads is <= 0, or the estimated static
//     per-thread workload is below it.
bool TryParallelX86(
const tvm::Tensor& tensor,
int64_t to_dim, // fuse dims before to_dim for parallel schedule, 0 to fuse all but last dim
tvm_codegen::CodeGenContext& ctx_codegen,
tvm_codegen::ScheduleContext& ctx_sched);
// Attribute key for compute ops: when present in the op's attrs,
// TryParallelX86 skips the op and returns false.
constexpr auto kNupharScheduleNoParallel = "nuphar_schedule_no_parallel";
} // namespace nuphar
} // namespace onnxruntime

View file

@ -3,26 +3,99 @@
#include "core/providers/nuphar/compiler/x86/scheduler/nuphar_scheduler.h"
#include "core/codegen/passes/scheduler/schedule_utils.h"
#include "core/framework/op_kernel_info.h"
#include "core/providers/nuphar/common/nuphar_settings.h"
#include "core/providers/nuphar/common/analysis/subgraph_codegen_stats.h"
#include "core/providers/nuphar/compiler/nuphar_codegen_ctx.h"
#include "core/codegen/passes/scheduler/schedule_utils.h"
#include "core/providers/nuphar/compiler/x86/scheduler/tensorize/intrin_gemv_ll_extern.h"
#include "core/providers/nuphar/compiler/x86/scheduler/tensorize/intrin_gemv_ll_ir.h"
#include "core/providers/nuphar/compiler/x86/x86_target_info.h"
#include "core/framework/op_kernel_info.h"
#include <tvm/tvm.h>
#include <tvm/ir_pass.h>
namespace onnxruntime {
namespace nuphar {
// Try to put a parallel schedule on the leading axes of tensor.
//   to_dim: axes before to_dim are candidates for fusing; 0 means all but the last dim.
// Returns true only when the parallel schedule has actually been applied.
bool TryParallelX86(
    const tvm::Tensor& tensor,
    int64_t to_dim,
    tvm_codegen::CodeGenContext& ctx_codegen,
    tvm_codegen::ScheduleContext& ctx_sched) {
  // Only compute ops that are not explicitly opted out can be parallelized.
  const auto* op_node = tensor->op.as<tvm::ComputeOpNode>();
  if (op_node == nullptr) {
    return false;
  }
  if (op_node->attrs.count(kNupharScheduleNoParallel) != 0) {
    return false;
  }

  const auto& dims = tensor->shape;
  const int num_dims = gsl::narrow<int>(dims.size());

  // Collect the leading axes to fuse, stopping at the first axis whose extent
  // differs from the tensor shape (it has been fused or split already).
  int64_t parallel_end = (to_dim != 0) ? to_dim : num_dims - 1;
  tvm::Array<tvm::IterVar> fusable;
  for (int64_t d = 0; d < parallel_end && d < gsl::narrow<int64_t>(op_node->axis.size()); ++d) {
    tvm::IterVar iv = op_node->axis[d];
    if (!tvm::ir::Equal(iv->dom->extent, dims[d])) {
      parallel_end = d;
      break;
    }
    fusable.push_back(iv);
  }
  if (fusable.size() == 0) {
    return false;
  }

  // Estimate the work done by each thread from static extents only:
  // reduce axes plus the dims that are not parallelized.
  // Symbolic dims are ignored (treated as 1).
  int64_t static_work = 1;
  for (const auto& r_axis : op_node->reduce_axis) {
    if (const int64_t* extent = tvm::as_const_int(r_axis->dom->extent)) {
      static_work *= *extent;
    }
  }
  for (int64_t d = parallel_end; d < num_dims; ++d) {
    if (const int64_t* extent = tvm::as_const_int(dims[d])) {
      static_work *= *extent;
    }
  }

  // A non-positive threshold disables parallel schedule; small workloads are skipped.
  const int64_t min_work = Promote<NupharCodeGenCtx>(&ctx_codegen)->GetCodeGenHandle()->parallel_min_workloads;
  const bool worth_parallel = (min_work > 0) && (static_work >= min_work);
  if (!worth_parallel) {
    return false;
  }

  // A single axis is parallelized directly; several axes are fused first.
  tvm::IterVar parallel_axis;
  if (fusable.size() == 1) {
    parallel_axis = fusable[0];
  } else {
    ctx_sched.schedule[tensor->op].fuse(fusable, &parallel_axis);
  }
  ctx_sched.schedule[tensor->op].parallel(parallel_axis);
  return true;
}
// x86 flavor of TryVectorization: looks up the natural vector width of the
// x86 codegen target, attempts a parallel schedule, then vectorizes.
// Returns the result of TryVectorization, or false if the precondition fails.
bool TryVectorizationX86(
    const tvm::Tensor& tensor,
    tvm_codegen::CodeGenContext& ctx_codegen,
    tvm_codegen::ScheduleContext& ctx_sched) {
  const bool eligible = ShouldTryVectorization(tensor, ctx_sched);
  if (!eligible) {
    return false;
  }

  auto* target = dynamic_cast<CodeGenTargetX86*>(ctx_codegen.GetCodeGenHandle()->codegen_target);
  ORT_ENFORCE(target != nullptr);
  const int64_t vector_width = target->NaturalVectorWidth(tensor->dtype.bits());

  // Parallel schedule is attempted alongside vectorization. Its result is
  // deliberately not or-ed into the return value, so that vectorization is
  // always tried regardless of whether parallelization succeeded.
  TryParallelX86(tensor, 0, ctx_codegen, ctx_sched);

  const bool vectorized = TryVectorization(tensor, vector_width, ctx_sched);
  return vectorized;
}
@ -176,7 +249,7 @@ static Status ConvScheduleX86(const tvm::Tensor& tensor,
ctx_sched.schedule[tensor->op].reorder({b, oc_chunk, y, xo, ic_chunk, m, n, ic_block, xi, oc_block});
if (ctx_codegen.GetCodeGenHandle()->enable_per_node_parallelized) {
if (ctx_codegen.GetCodeGenHandle()->parallel_min_workloads > 0) {
tvm::Array<tvm::IterVar> fused_axis;
fused_axis.push_back(b);
fused_axis.push_back(oc_chunk);
@ -186,6 +259,7 @@ static Status ConvScheduleX86(const tvm::Tensor& tensor,
ctx_sched.schedule[tensor->op].fuse(fused_axis, &parallel_axis);
ctx_sched.schedule[tensor->op].parallel(parallel_axis);
}
ctx_sched.schedule[tensor->op].vectorize(oc_block);
return Status::OK();
@ -243,7 +317,7 @@ static Status MatMul_2DWeight_Schedule(
ctx_sched.schedule[CC->op].unroll(ki);
ctx_sched.schedule[CC->op].vectorize(yc);
if (ctx_codegen.GetCodeGenHandle()->enable_per_node_parallelized) {
if (ctx_codegen.GetCodeGenHandle()->parallel_min_workloads > 0) {
// parallelize
tvm::Array<tvm::IterVar> fused_axis;
for (size_t d = 0; d < C_rank - 2; ++d)

View file

@ -22,6 +22,7 @@ bool TVM_SCHEDULER_CLASS(Extern, NupharX86TVMRule)::Evaluate(
static bool ReduceVScheduleNupharX86(
const tvm::Tensor& tensor,
tvm_codegen::CodeGenContext& ctx_codegen,
tvm_codegen::ScheduleContext& ctx_sched) {
InsertRootScheduleAndClosure(tensor, ctx_sched);
@ -55,6 +56,8 @@ static bool ReduceVScheduleNupharX86(
if (shape.size() > 0)
head_dim = as_const_int(shape[0]);
bool try_parallel = true;
// unroll packed reduce by checking head dim
if (nullptr != head_dim) {
// if head_dim is already fused, don't unroll
@ -81,8 +84,13 @@ static bool ReduceVScheduleNupharX86(
ctx_sched.schedule[tensor->op].reorder(reorder_axis);
ctx_sched.schedule[tensor->op].unroll(x0);
try_parallel = false;
}
}
if (try_parallel) {
TryParallelX86(tensor, *fuse_dim, ctx_codegen, ctx_sched);
}
} else if (compute_op->axis.size() > 0 &&
tvm::as_const_int(tensor->shape[0]) != nullptr) {
tvm::IterVar x = compute_op->axis[0];
@ -101,7 +109,7 @@ static bool ReduceVScheduleNupharX86(
bool TVM_SCHEDULER_CLASS(Reduce, NupharX86TVMRule)::Evaluate(
const tvm::Tensor& tensor,
const Node*,
tvm_codegen::CodeGenContext&,
tvm_codegen::CodeGenContext& ctx_codegen,
tvm_codegen::ScheduleContext& ctx_sched) {
// respect topi::kCommReduce
if (tensor->op->tag == topi::kCommReduce) {
@ -109,7 +117,7 @@ bool TVM_SCHEDULER_CLASS(Reduce, NupharX86TVMRule)::Evaluate(
}
if (tensor->op->tag == nuphar::kNupharVReduce) {
return ReduceVScheduleNupharX86(tensor, ctx_sched);
return ReduceVScheduleNupharX86(tensor, ctx_codegen, ctx_sched);
}
// unknown goes to InsertRootScheduleAndClosure

View file

@ -7,6 +7,7 @@
#include "core/codegen/mti/mti_tvm_utils.h"
#include "core/codegen/mti/tensor/pad_ops.h"
#include "core/codegen/mti/tensor/reshape_ops.h"
#include "core/providers/nuphar/compiler/x86/scheduler/nuphar_scheduler.h"
#include <topi/reduction.h>
namespace onnxruntime {
@ -85,7 +86,7 @@ tvm::Tensor ReduceValueWithoutSplit(const tvm::Tensor& X,
tvm::Map<std::string, tvm::NodeRef> attrs;
attrs.Set(kNupharVReduceFuseDim, tvm::Expr(fuse_dim));
attrs.Set(kNupharScheduleNoParallel, tvm::Expr(true));
return tvm::compute(output_shape, l_out, name + "_regular_reduce", kNupharVReduce, attrs);
}

View file

@ -125,8 +125,8 @@ NupharExecutionProvider::NupharExecutionProvider(const NupharExecutionProviderIn
handle->shape_inference = whole_graph_shape_infer_;
// TODO: remove
handle->enable_per_node_parallelized = info.enable_per_node_parallel;
handle->parallel_min_workloads = std::stoi(settings.GetOptionValue(kNupharParallelMinWorkloads));
// TODO: remove
handle->allow_unaligned_buffers = info.allow_unaligned_buffers; // TODO remove this

View file

@ -31,9 +31,6 @@ constexpr const char* default_nuphar_target_str = stackvm_target_str;
// Information needed to construct Nuphar execution providers.
struct NupharExecutionProviderInfo {
// By default, let provider decide the target by passing in empty string.
bool enable_per_node_parallel; // TODO: remove
// this flag set TVM build_config with data_alignment=1, at the cost of performance
bool allow_unaligned_buffers;
@ -43,10 +40,8 @@ struct NupharExecutionProviderInfo {
std::string settings;
explicit NupharExecutionProviderInfo(bool unaligned_buffers,
const std::string& str_settings = "",
bool per_node_parallel = true)
: enable_per_node_parallel(per_node_parallel),
allow_unaligned_buffers(unaligned_buffers),
const std::string& str_settings = "")
: allow_unaligned_buffers(unaligned_buffers),
settings(str_settings) {}
NupharExecutionProviderInfo() = default;
};

View file

@ -21,7 +21,7 @@ struct NupharExecutionProviderFactory : IExecutionProviderFactory {
};
std::unique_ptr<IExecutionProvider> NupharExecutionProviderFactory::CreateProvider() {
NupharExecutionProviderInfo info(allow_unaligned_buffers_, settings_, /*per_node_parallel*/ true);
NupharExecutionProviderInfo info(allow_unaligned_buffers_, settings_);
return onnxruntime::make_unique<NupharExecutionProvider>(info);
}