From 3cd06cb38cd5e76619dbd7a2859897ab9cdd2d8c Mon Sep 17 00:00:00 2001 From: chethanpk <63478277+chethanpk@users.noreply.github.com> Date: Mon, 21 Jun 2021 17:15:46 -0700 Subject: [PATCH] Added support for ReduceMean on DNNL EP for CPU and GPU (#7902) * Added support for ReduceMean on DNNL EP for CPU and GPU Signed-off-by: Chethan Palangotu Keshava * Added a fix for a ResNet model failure in which the dst shape for ReduceMean could not be created when the op was part of a subgraph with other ops Signed-off-by: Chethan Palangotu Keshava * Removing the DNNL EP from these unit tests. This is in anticipation of two changes: - DNNL EP unit tests will be added in a different location later on, so adding the EP individually to these tests will not be necessary - Including the EP in these tests was causing a memory leak failure in the debug build. The bug is in the EP itself and not in the code added for ReduceMean. The fix for this is part of the I/O handling overhaul, which will be added later. * Update reduction_ops_test.cc Had accidentally deleted a newline. 
Making sure there are no unnecessary changes in this file --- .../core/providers/dnnl/dnnl_op_manager.cc | 1 + .../dnnl/subgraph/dnnl_func_kernel.cc | 49 ++-- .../providers/dnnl/subgraph/dnnl_reducemean.h | 234 ++++++++++++++++++ .../cpu/reduction/reduction_ops_test.cc | 2 +- 4 files changed, 271 insertions(+), 15 deletions(-) create mode 100644 onnxruntime/core/providers/dnnl/subgraph/dnnl_reducemean.h diff --git a/onnxruntime/core/providers/dnnl/dnnl_op_manager.cc b/onnxruntime/core/providers/dnnl/dnnl_op_manager.cc index 0aac1c8c86..24bb06a69b 100644 --- a/onnxruntime/core/providers/dnnl/dnnl_op_manager.cc +++ b/onnxruntime/core/providers/dnnl/dnnl_op_manager.cc @@ -13,6 +13,7 @@ DnnlOpManager::DnnlOpManager() { dnnl_ops_map_.emplace(std::make_pair("LRN", std::unique_ptr(new DnnlDefaultNodeCapability()))); dnnl_ops_map_.emplace(std::make_pair("MatMul", std::unique_ptr(new DnnlMatMulNodeCapability()))); dnnl_ops_map_.emplace(std::make_pair("MaxPool", std::unique_ptr(new DnnlPoolNodeCapability()))); + dnnl_ops_map_.emplace(std::make_pair("ReduceMean", std::unique_ptr(new DnnlReduceMeanNodeCapability()))); dnnl_ops_map_.emplace(std::make_pair("Relu", std::unique_ptr(new DnnlDefaultNodeCapability()))); dnnl_ops_map_.emplace(std::make_pair("Sum", std::unique_ptr(new DnnlDefaultNodeCapability()))); #if defined(ENABLE_TRAINING) diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_func_kernel.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_func_kernel.cc index 5e330725db..f6c4da8e3a 100644 --- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_func_kernel.cc +++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_func_kernel.cc @@ -15,6 +15,7 @@ #include "core/providers/dnnl/subgraph/dnnl_pool.h" #include "core/providers/dnnl/subgraph/dnnl_sum.h" #include "core/providers/dnnl/subgraph/dnnl_lrn.h" +#include "core/providers/dnnl/subgraph/dnnl_reducemean.h" #include "core/providers/dnnl/subgraph/dnnl_matmul.h" #ifdef ENABLE_TRAINING #include 
"core/providers/dnnl/subgraph/dnnl_convgrad.h" @@ -108,6 +109,15 @@ class SubgraphPrimitive : public PrimitiveBase { kernel->parents_.push_back(context_.kernels[index]); } context_.kernels.push_back(kernel); + } else if (dnnl_node.name == "ReduceMean") { + std::ostringstream os; + os << "ReduceMean-" << dnnl_node.node_index << "-"; + std::shared_ptr> kernel; + kernel = std::make_shared>(dnnl_node, params.provider, *params.attributes, os.str()); + for (auto index : dnnl_node.parent_nodes) { + kernel->parents_.push_back(context_.kernels[index]); + } + context_.kernels.push_back(kernel); } else if (dnnl_node.name == "BatchNormalization") { std::ostringstream os; os << "BatchNormalization-" << dnnl_node.node_index << "-"; @@ -340,24 +350,35 @@ template Status DnnlFuncKernel::Compute(const OrtCustomOpApi* api, OrtKernelContext* context) const { Status status; try { - // The training runner sets up the training graph then calls it via the inferance runner using a new thread - // each call. Since the SubgraphPrimitivePool stashes the nodes based on the thread_local memory it results in a new - // stash being created per-call from the training loop. In theory the thread_local memory should be freed when the calling - // thread is destroyed but this was not being seen when actually running the code. Instead of relying on the thread_local - // memory being freed we name a new SubgraphPrimitive instead of using the SubgraphPrimitivePool when the code is built for - // training. (If the training running is updated to use a thread pool instead of a new thread each run we may be able to - // revert back to the SubgraphPrimitivePool.) + // The training runner sets up the training graph then calls it via the inferance runner using a new thread + // each call. Since the SubgraphPrimitivePool stashes the nodes based on the thread_local memory it results in a new + // stash being created per-call from the training loop. 
In theory the thread_local memory should be freed when the calling + // thread is destroyed but this was not being seen when actually running the code. Instead of relying on the thread_local + // memory being freed we name a new SubgraphPrimitive instead of using the SubgraphPrimitivePool when the code is built for + // training. (If the training running is updated to use a thread pool instead of a new thread each run we may be able to + // revert back to the SubgraphPrimitivePool.) #ifdef ENABLE_TRAINING - std::unique_ptr> primitive = std::make_unique>(api, context, params_); + std::unique_ptr> primitive = std::make_unique>(api, context, params_); + primitive->UpdateProvider(params_); + status = primitive->Compute(api, context); #else - SubgraphPrimitive* primitive = SubgraphPrimitivePool::Get(api, context, params_); -#endif // ENABLE_TRAINING - - primitive->UpdateProvider(params_); - status = primitive->Compute(api, context); + std::string subgraph_key = params_.subgraph_key; + if (subgraph_key.find("ReduceMean") != std::string::npos) { + std::unique_ptr> primitive = std::make_unique>(api, context, params_); + primitive->UpdateProvider(params_); + status = primitive->Compute(api, context); + } + else + { + SubgraphPrimitive* primitive = SubgraphPrimitivePool::Get(api, context, params_); + primitive->UpdateProvider(params_); + status = primitive->Compute(api, context); + } +#endif } catch (const dnnl::error& e) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Status: ", e.status, ", message: ", e.what()); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Status: ", e.status, ", message: ", e.what()); } + return status; } diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_reducemean.h b/onnxruntime/core/providers/dnnl/subgraph/dnnl_reducemean.h new file mode 100644 index 0000000000..d3026c38d9 --- /dev/null +++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_reducemean.h @@ -0,0 +1,234 @@ +// Copyright(C) 2019 Intel Corporation +// Licensed under the MIT License + 
+#pragma once +#include "core/providers/dnnl/dnnl_fwd.h" +#include "core/providers/dnnl/dnnl_execution_provider.h" +#include "core/providers/dnnl/subgraph/dnnl_kernel.h" +#include + +namespace onnxruntime { +namespace ort_dnnl { + +template +class DnnlReduceMean : public DnnlKernel { + public: + DnnlReduceMean(const DnnlNode& node, + DNNLExecutionProvider* provider, + const NodeAttributes& attributes, + const std::string attributes_prefix = "") : DnnlKernel(node, provider) { + ReadAttributes(attributes, attributes_prefix); + ORT_UNUSED_PARAMETER(attributes); + ORT_UNUSED_PARAMETER(attributes_prefix); + } + + void ReadAttributes(const NodeAttributes& attributes, + const std::string attributes_prefix = "") override { + auto attr = attributes.find(attributes_prefix + "keepdims"); + if (attr != attributes.end() && + attr->second().type() == ::ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT) { + keepdims_attr_ = attr->second().i(); + } + auto attr2 = attributes.find(attributes_prefix + "axes"); + if (attr2 != attributes.end()) { + auto& proto = attr2->second(); + GetIntsAttr(proto, axes_attr_); + } + } + + void CreatePrimitives(const OrtCustomOpApi* api, + OrtKernelContext* context, + const std::unordered_map& dnnl_engine, + std::vector& net, + std::vector>& net_args) { + dnnl::engine cpu_engine; + dnnl::engine engine_to_use; + std::unordered_map::const_iterator iter = dnnl_engine.find(dnnl::engine::kind::cpu); + if (iter != dnnl_engine.end()) { + cpu_engine = (dnnl::engine)iter->second; + engine_to_use = cpu_engine; + } + gpu_available_ = false; + dnnl::engine gpu_engine; + iter = dnnl_engine.find(dnnl::engine::kind::gpu); + if (iter != dnnl_engine.end()) { + gpu_engine = (dnnl::engine)(iter->second); + gpu_available_ = true; + engine_to_use = gpu_engine; + LOGS_DEFAULT(INFO) << "gpu engine found" << std::endl; + } + Ort::CustomOpApi ort{*api}; + int input_index = mklnode_ptr_->input_start_index < 0 ? 
0 : mklnode_ptr_->input_start_index; + + TensorShape x_shape; + if (mklnode_ptr_->parent_nodes.empty()) { + const OrtValue* input_tensor = ort.KernelContext_GetInput(context, input_index); + auto tensor_info = ort.GetTensorTypeAndShape(input_tensor); + auto tensor_shape = ort.GetTensorShape(tensor_info); + ort.ReleaseTensorTypeAndShapeInfo(tensor_info); + + auto xshape = tensor_shape.data(); + auto xdim = tensor_shape.size(); + + dnnl::memory::dims dims(xdim); + ort_source_format_ = GetSourceFormat(static_cast(xdim)); + x_shape = TensorShape(xshape, xdim); + + if (x_shape.NumDimensions() == 0) { + primitive_created_status_ = Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Shape of size zero " + x_shape.ToString()); + return; + } + + dnnl::memory::dims src_dims( + x_shape.GetDims().begin(), x_shape.GetDims().end()); + + ort_source_desc_ = dnnl::memory::desc( + {src_dims}, DnnnType(), ort_source_format_); + source_desc_ = ort_source_desc_; + src_md_ = std::make_unique( + dnnl::memory::desc({src_dims}, DnnnType(), ort_source_format_)); + src_mem_ = std::make_unique( + dnnl::memory({{src_dims}, DnnnType(), ort_source_format_}, cpu_engine, nullptr)); + if (gpu_available_) { + src_mem_gpu_ = std::make_unique(*src_md_, gpu_engine); + net.push_back(mkldnn::reorder(*src_mem_, *src_mem_gpu_)); + net_args.push_back({{MKLDNN_ARG_SRC, *src_mem_}, + {MKLDNN_ARG_DST, *src_mem_gpu_}}); + } + } else { + src_md_ = std::make_unique( + dnnl::memory::desc(parents_[0].get()->primitive_dst_desc_)); + if (!gpu_available_) { + src_mem_ = parents_[0].get()->primitive_dst_mem_; + } else { // gpu_available_ + src_mem_gpu_ = parents_[0].get()->primitive_dst_mem_; + } + x_shape = parents_[0].get()->primitive_dst_shape_; + ort_source_format_ = parents_[0].get()->ort_source_format_; + ort_source_desc_ = parents_[0].get()->ort_source_desc_; + source_desc_ = parents_[0].get()->primitive_dst_desc_; + } + //We need to calculate output tensor shape + //First we initialize it with input shape 
and then we modify it based on the attribute values + //This is because the attribute values decide the output shape + + auto xshape = x_shape.GetDims(); + auto ndim = x_shape.NumDimensions(); + for (unsigned long int i = 0; i < ndim; i++) { + if (axes_attr_.size() == 0) + xshape[i] = 1; //If no axis is specified, then output shape is just all 1's + else if (i < axes_attr_.size()) { + if (axes_attr_[i] < 0) + xshape[ndim + axes_attr_[i]] = 1; + else + xshape[axes_attr_[i]] = 1; + } //If there is axis, then make the respective dimensions 1, keeping the other dimension values untouched. + } + primitive_dst_shape_ = TensorShape(xshape.data(), ndim); + + dnnl::memory::dims dst_dims_mkl(primitive_dst_shape_.GetDims().begin(), primitive_dst_shape_.GetDims().end()); + + primitive_dst_md_ = std::make_unique( + dnnl::memory::desc({ dst_dims_mkl }, DnnnType(), dnnl::memory::format_tag::any)); + + // Create operation descriptor. + std::unique_ptr fwd_desc_ = std::make_unique(dnnl::reduction::desc( + dnnl::algorithm::reduction_mean, *src_md_, *primitive_dst_md_, 0.f, 0.f)); + // Create primitive descriptor. + std::unique_ptr reducemean_fwd_pd_ = std::make_unique(dnnl::reduction::primitive_desc(*fwd_desc_, engine_to_use)); + // Create the primitive. + std::unique_ptr reducemean_fwd_ = std::make_unique(dnnl::reduction(*reducemean_fwd_pd_)); + + primitive_src_desc_ = reducemean_fwd_pd_.get()->src_desc(); + primitive_dst_desc_ = reducemean_fwd_pd_.get()->dst_desc(); + + if (!gpu_available_) { + if (mklnode_ptr_->output_index >= 0) { + // last node of sub-graph. need to allocate memory for output_tensor + if (primitive_dst_desc_ != ort_source_desc_) { + // reorder neded. Use primitive output as input to reorder and + // allocate buffer for reorder output, final output of this subgraph + primitive_dst_mem_ = std::make_shared(dnnl::memory(reducemean_fwd_pd_.get()->dst_desc(), cpu_engine)); + } else { + // Last node but re-order not needed. 
Allocate buffer to output of this node + primitive_dst_mem_ = std::make_shared(dnnl::memory(reducemean_fwd_pd_.get()->dst_desc(), cpu_engine, nullptr)); + } + } else { + // Intermediate node. Use dnnl kernel internal memory for output and + // use this as input to next node. + primitive_dst_mem_ = std::make_shared(dnnl::memory(reducemean_fwd_pd_.get()->dst_desc(), cpu_engine)); + } + } else { // gpu_available_ + primitive_dst_mem_ = std::make_shared(dnnl::memory(reducemean_fwd_pd_.get()->dst_desc(), gpu_engine)); + } + + + if (!gpu_available_) { + net.push_back(*reducemean_fwd_); + net_args.push_back({{DNNL_ARG_SRC, *src_mem_}, + {DNNL_ARG_DST, *primitive_dst_mem_}}); + } else { // gpu_available_ + net.push_back(*reducemean_fwd_); + net_args.push_back({{DNNL_ARG_SRC, *src_mem_gpu_}, + {DNNL_ARG_DST, *primitive_dst_mem_}}); + } + + if (mklnode_ptr_->output_index >= 0) { + // one of the end nodes. Allocate output buffer memory and + // reorder is necessary + dnnl::memory::data_type t = DnnnType(); + InitDstReorderOutput(cpu_engine, t, net, net_args, gpu_available_); + } + } + + Status Bind(const OrtCustomOpApi* api, OrtKernelContext* context) override { + Ort::CustomOpApi ort{*api}; + + ORT_RETURN_IF_ERROR(primitive_created_status_); + + int input_index = mklnode_ptr_->input_start_index < 0 ? 0 : mklnode_ptr_->input_start_index; + + if (mklnode_ptr_->parent_nodes.empty()) { + // Sub-graph's first node. 
Read input from input buffer + const OrtValue* input_tensor = ort.KernelContext_GetInput(context, input_index); + const T* src_data = const_cast(ort.GetTensorData(input_tensor)); + src_mem_->set_data_handle(static_cast(const_cast(src_data))); + } + + if (mklnode_ptr_->output_index >= 0) { + auto& y_dims = primitive_dst_shape_.GetDims(); + // Allocate memory for output bufffer + OrtValue* output = ort.KernelContext_GetOutput(context, mklnode_ptr_->output_index, &y_dims[0], static_cast(primitive_dst_shape_.GetDims().size())); + T* dst_data = ort.GetTensorMutableData(output); + + if (!gpu_available_) { + if (primitive_dst_desc_ != ort_source_desc_) { + reorder_dst_mem_to_->set_data_handle(dst_data); + } else { + primitive_dst_mem_->set_data_handle(dst_data); + } + } else { // gpu_available_ + reorder_dst_mem_to_->set_data_handle(dst_data); + } + } + + return Status::OK(); + } + + private: + std::shared_ptr src_mem_; + std::shared_ptr src_mem_gpu_; + + //std::unique_ptr fwd_desc_; + //std::unique_ptr reducemean_fwd_pd_; + //std::unique_ptr reducemean_fwd_; + + std::unique_ptr src_md_; + + int64_t keepdims_attr_; + std::vector axes_attr_; + + bool gpu_available_; +}; +} // namespace ort_dnnl +} // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc index 01b0671261..ed896269fd 100644 --- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc +++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc @@ -1030,7 +1030,7 @@ TEST(ReductionOpTest, ReduceMean) { 9.0f, 10.0f, 11.0f, 12.0f}); test.AddOutput("reduced", {1, 2, 1}, {5.5f, 7.5f}); - + test.Run(); }