Added support for ReduceMean on DNNL EP for CPU and GPU (#7902)

* Added support for ReduceMean on DNNL EP for CPU and GPU

Signed-off-by: Chethan Palangotu Keshava <chethan.palangotu.keshava@intel.com>

* Added fix for a resnet model failure where it was failing to create dst shape for reducemean when it was part of a subgraph with other ops

Signed-off-by: Chethan Palangotu Keshava <chethan.palangotu.keshava@intel.com>

* Removing the DNNL EP from these unit tests. This is in anticipation of two changes:
- DNNL EP unit tests would be added in a different location later on, so addition of EP individually to these tests will not be necessary
- This was causing a memory leak fail in debug build. The bug is in the EP itself and not in the code added for reducemean. The fix for this is in the i/o handling overhaul which will be added later.

* Update reduction_ops_test.cc

Had accidentally deleted a new line. Making sure there are no unnecessary changes in this file
This commit is contained in:
chethanpk 2021-06-21 17:15:46 -07:00 committed by GitHub
parent 352d560fd5
commit 3cd06cb38c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 271 additions and 15 deletions

View file

@ -13,6 +13,7 @@ DnnlOpManager::DnnlOpManager() {
dnnl_ops_map_.emplace(std::make_pair("LRN", std::unique_ptr<DnnlNodeCapability>(new DnnlDefaultNodeCapability())));
dnnl_ops_map_.emplace(std::make_pair("MatMul", std::unique_ptr<DnnlNodeCapability>(new DnnlMatMulNodeCapability())));
dnnl_ops_map_.emplace(std::make_pair("MaxPool", std::unique_ptr<DnnlNodeCapability>(new DnnlPoolNodeCapability())));
dnnl_ops_map_.emplace(std::make_pair("ReduceMean", std::unique_ptr<DnnlNodeCapability>(new DnnlReduceMeanNodeCapability())));
dnnl_ops_map_.emplace(std::make_pair("Relu", std::unique_ptr<DnnlNodeCapability>(new DnnlDefaultNodeCapability())));
dnnl_ops_map_.emplace(std::make_pair("Sum", std::unique_ptr<DnnlNodeCapability>(new DnnlDefaultNodeCapability())));
#if defined(ENABLE_TRAINING)

View file

@ -15,6 +15,7 @@
#include "core/providers/dnnl/subgraph/dnnl_pool.h"
#include "core/providers/dnnl/subgraph/dnnl_sum.h"
#include "core/providers/dnnl/subgraph/dnnl_lrn.h"
#include "core/providers/dnnl/subgraph/dnnl_reducemean.h"
#include "core/providers/dnnl/subgraph/dnnl_matmul.h"
#ifdef ENABLE_TRAINING
#include "core/providers/dnnl/subgraph/dnnl_convgrad.h"
@ -108,6 +109,15 @@ class SubgraphPrimitive : public PrimitiveBase {
kernel->parents_.push_back(context_.kernels[index]);
}
context_.kernels.push_back(kernel);
} else if (dnnl_node.name == "ReduceMean") {
std::ostringstream os;
os << "ReduceMean-" << dnnl_node.node_index << "-";
std::shared_ptr<DnnlReduceMean<T>> kernel;
kernel = std::make_shared<DnnlReduceMean<T>>(dnnl_node, params.provider, *params.attributes, os.str());
for (auto index : dnnl_node.parent_nodes) {
kernel->parents_.push_back(context_.kernels[index]);
}
context_.kernels.push_back(kernel);
} else if (dnnl_node.name == "BatchNormalization") {
std::ostringstream os;
os << "BatchNormalization-" << dnnl_node.node_index << "-";
@ -340,24 +350,35 @@ template <typename T>
Status DnnlFuncKernel<T>::Compute(const OrtCustomOpApi* api, OrtKernelContext* context) const {
Status status;
try {
// The training runner sets up the training graph then calls it via the inferance runner using a new thread
// each call. Since the SubgraphPrimitivePool stashes the nodes based on the thread_local memory it results in a new
// stash being created per-call from the training loop. In theory the thread_local memory should be freed when the calling
// thread is destroyed but this was not being seen when actually running the code. Instead of relying on the thread_local
// memory being freed we name a new SubgraphPrimitive instead of using the SubgraphPrimitivePool when the code is built for
// training. (If the training running is updated to use a thread pool instead of a new thread each run we may be able to
// revert back to the SubgraphPrimitivePool.)
// The training runner sets up the training graph then calls it via the inferance runner using a new thread
// each call. Since the SubgraphPrimitivePool stashes the nodes based on the thread_local memory it results in a new
// stash being created per-call from the training loop. In theory the thread_local memory should be freed when the calling
// thread is destroyed but this was not being seen when actually running the code. Instead of relying on the thread_local
// memory being freed we name a new SubgraphPrimitive instead of using the SubgraphPrimitivePool when the code is built for
// training. (If the training running is updated to use a thread pool instead of a new thread each run we may be able to
// revert back to the SubgraphPrimitivePool.)
#ifdef ENABLE_TRAINING
std::unique_ptr<SubgraphPrimitive<T>> primitive = std::make_unique<SubgraphPrimitive<T>>(api, context, params_);
std::unique_ptr<SubgraphPrimitive<T>> primitive = std::make_unique<SubgraphPrimitive<T>>(api, context, params_);
primitive->UpdateProvider(params_);
status = primitive->Compute(api, context);
#else
SubgraphPrimitive<T>* primitive = SubgraphPrimitivePool<T>::Get(api, context, params_);
#endif // ENABLE_TRAINING
primitive->UpdateProvider(params_);
status = primitive->Compute(api, context);
std::string subgraph_key = params_.subgraph_key;
if (subgraph_key.find("ReduceMean") != std::string::npos) {
std::unique_ptr<SubgraphPrimitive<T>> primitive = std::make_unique<SubgraphPrimitive<T>>(api, context, params_);
primitive->UpdateProvider(params_);
status = primitive->Compute(api, context);
}
else
{
SubgraphPrimitive<T>* primitive = SubgraphPrimitivePool<T>::Get(api, context, params_);
primitive->UpdateProvider(params_);
status = primitive->Compute(api, context);
}
#endif
} catch (const dnnl::error& e) {
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Status: ", e.status, ", message: ", e.what());
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Status: ", e.status, ", message: ", e.what());
}
return status;
}

View file

@ -0,0 +1,234 @@
// Copyright(C) 2019 Intel Corporation
// Licensed under the MIT License
#pragma once
#include "core/providers/dnnl/dnnl_fwd.h"
#include "core/providers/dnnl/dnnl_execution_provider.h"
#include "core/providers/dnnl/subgraph/dnnl_kernel.h"
#include <vector>
namespace onnxruntime {
namespace ort_dnnl {
template <typename T>
class DnnlReduceMean : public DnnlKernel {
public:
DnnlReduceMean(const DnnlNode& node,
DNNLExecutionProvider* provider,
const NodeAttributes& attributes,
const std::string attributes_prefix = "") : DnnlKernel(node, provider) {
ReadAttributes(attributes, attributes_prefix);
ORT_UNUSED_PARAMETER(attributes);
ORT_UNUSED_PARAMETER(attributes_prefix);
}
void ReadAttributes(const NodeAttributes& attributes,
const std::string attributes_prefix = "") override {
auto attr = attributes.find(attributes_prefix + "keepdims");
if (attr != attributes.end() &&
attr->second().type() == ::ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT) {
keepdims_attr_ = attr->second().i();
}
auto attr2 = attributes.find(attributes_prefix + "axes");
if (attr2 != attributes.end()) {
auto& proto = attr2->second();
GetIntsAttr(proto, axes_attr_);
}
}
void CreatePrimitives(const OrtCustomOpApi* api,
OrtKernelContext* context,
const std::unordered_map<dnnl::engine::kind, dnnl::engine>& dnnl_engine,
std::vector<dnnl::primitive>& net,
std::vector<std::unordered_map<int, dnnl::memory>>& net_args) {
dnnl::engine cpu_engine;
dnnl::engine engine_to_use;
std::unordered_map<dnnl::engine::kind, dnnl::engine>::const_iterator iter = dnnl_engine.find(dnnl::engine::kind::cpu);
if (iter != dnnl_engine.end()) {
cpu_engine = (dnnl::engine)iter->second;
engine_to_use = cpu_engine;
}
gpu_available_ = false;
dnnl::engine gpu_engine;
iter = dnnl_engine.find(dnnl::engine::kind::gpu);
if (iter != dnnl_engine.end()) {
gpu_engine = (dnnl::engine)(iter->second);
gpu_available_ = true;
engine_to_use = gpu_engine;
LOGS_DEFAULT(INFO) << "gpu engine found" << std::endl;
}
Ort::CustomOpApi ort{*api};
int input_index = mklnode_ptr_->input_start_index < 0 ? 0 : mklnode_ptr_->input_start_index;
TensorShape x_shape;
if (mklnode_ptr_->parent_nodes.empty()) {
const OrtValue* input_tensor = ort.KernelContext_GetInput(context, input_index);
auto tensor_info = ort.GetTensorTypeAndShape(input_tensor);
auto tensor_shape = ort.GetTensorShape(tensor_info);
ort.ReleaseTensorTypeAndShapeInfo(tensor_info);
auto xshape = tensor_shape.data();
auto xdim = tensor_shape.size();
dnnl::memory::dims dims(xdim);
ort_source_format_ = GetSourceFormat(static_cast<int>(xdim));
x_shape = TensorShape(xshape, xdim);
if (x_shape.NumDimensions() == 0) {
primitive_created_status_ = Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Shape of size zero " + x_shape.ToString());
return;
}
dnnl::memory::dims src_dims(
x_shape.GetDims().begin(), x_shape.GetDims().end());
ort_source_desc_ = dnnl::memory::desc(
{src_dims}, DnnnType<T>(), ort_source_format_);
source_desc_ = ort_source_desc_;
src_md_ = std::make_unique<dnnl::memory::desc>(
dnnl::memory::desc({src_dims}, DnnnType<T>(), ort_source_format_));
src_mem_ = std::make_unique<dnnl::memory>(
dnnl::memory({{src_dims}, DnnnType<T>(), ort_source_format_}, cpu_engine, nullptr));
if (gpu_available_) {
src_mem_gpu_ = std::make_unique<dnnl::memory>(*src_md_, gpu_engine);
net.push_back(mkldnn::reorder(*src_mem_, *src_mem_gpu_));
net_args.push_back({{MKLDNN_ARG_SRC, *src_mem_},
{MKLDNN_ARG_DST, *src_mem_gpu_}});
}
} else {
src_md_ = std::make_unique<dnnl::memory::desc>(
dnnl::memory::desc(parents_[0].get()->primitive_dst_desc_));
if (!gpu_available_) {
src_mem_ = parents_[0].get()->primitive_dst_mem_;
} else { // gpu_available_
src_mem_gpu_ = parents_[0].get()->primitive_dst_mem_;
}
x_shape = parents_[0].get()->primitive_dst_shape_;
ort_source_format_ = parents_[0].get()->ort_source_format_;
ort_source_desc_ = parents_[0].get()->ort_source_desc_;
source_desc_ = parents_[0].get()->primitive_dst_desc_;
}
//We need to calculate output tensor shape
//First we initialize it with input shape and then we modify it based on the attribute values
//This is because the attribute values decide the output shape
auto xshape = x_shape.GetDims();
auto ndim = x_shape.NumDimensions();
for (unsigned long int i = 0; i < ndim; i++) {
if (axes_attr_.size() == 0)
xshape[i] = 1; //If no axis is specified, then output shape is just all 1's
else if (i < axes_attr_.size()) {
if (axes_attr_[i] < 0)
xshape[ndim + axes_attr_[i]] = 1;
else
xshape[axes_attr_[i]] = 1;
} //If there is axis, then make the respective dimensions 1, keeping the other dimension values untouched.
}
primitive_dst_shape_ = TensorShape(xshape.data(), ndim);
dnnl::memory::dims dst_dims_mkl(primitive_dst_shape_.GetDims().begin(), primitive_dst_shape_.GetDims().end());
primitive_dst_md_ = std::make_unique<dnnl::memory::desc>(
dnnl::memory::desc({ dst_dims_mkl }, DnnnType<T>(), dnnl::memory::format_tag::any));
// Create operation descriptor.
std::unique_ptr<dnnl::reduction::desc> fwd_desc_ = std::make_unique<dnnl::reduction::desc>(dnnl::reduction::desc(
dnnl::algorithm::reduction_mean, *src_md_, *primitive_dst_md_, 0.f, 0.f));
// Create primitive descriptor.
std::unique_ptr<dnnl::reduction::primitive_desc> reducemean_fwd_pd_ = std::make_unique<dnnl::reduction::primitive_desc>(dnnl::reduction::primitive_desc(*fwd_desc_, engine_to_use));
// Create the primitive.
std::unique_ptr<dnnl::primitive> reducemean_fwd_ = std::make_unique<dnnl::reduction>(dnnl::reduction(*reducemean_fwd_pd_));
primitive_src_desc_ = reducemean_fwd_pd_.get()->src_desc();
primitive_dst_desc_ = reducemean_fwd_pd_.get()->dst_desc();
if (!gpu_available_) {
if (mklnode_ptr_->output_index >= 0) {
// last node of sub-graph. need to allocate memory for output_tensor
if (primitive_dst_desc_ != ort_source_desc_) {
// reorder neded. Use primitive output as input to reorder and
// allocate buffer for reorder output, final output of this subgraph
primitive_dst_mem_ = std::make_shared<dnnl::memory>(dnnl::memory(reducemean_fwd_pd_.get()->dst_desc(), cpu_engine));
} else {
// Last node but re-order not needed. Allocate buffer to output of this node
primitive_dst_mem_ = std::make_shared<dnnl::memory>(dnnl::memory(reducemean_fwd_pd_.get()->dst_desc(), cpu_engine, nullptr));
}
} else {
// Intermediate node. Use dnnl kernel internal memory for output and
// use this as input to next node.
primitive_dst_mem_ = std::make_shared<dnnl::memory>(dnnl::memory(reducemean_fwd_pd_.get()->dst_desc(), cpu_engine));
}
} else { // gpu_available_
primitive_dst_mem_ = std::make_shared<dnnl::memory>(dnnl::memory(reducemean_fwd_pd_.get()->dst_desc(), gpu_engine));
}
if (!gpu_available_) {
net.push_back(*reducemean_fwd_);
net_args.push_back({{DNNL_ARG_SRC, *src_mem_},
{DNNL_ARG_DST, *primitive_dst_mem_}});
} else { // gpu_available_
net.push_back(*reducemean_fwd_);
net_args.push_back({{DNNL_ARG_SRC, *src_mem_gpu_},
{DNNL_ARG_DST, *primitive_dst_mem_}});
}
if (mklnode_ptr_->output_index >= 0) {
// one of the end nodes. Allocate output buffer memory and
// reorder is necessary
dnnl::memory::data_type t = DnnnType<T>();
InitDstReorderOutput(cpu_engine, t, net, net_args, gpu_available_);
}
}
Status Bind(const OrtCustomOpApi* api, OrtKernelContext* context) override {
Ort::CustomOpApi ort{*api};
ORT_RETURN_IF_ERROR(primitive_created_status_);
int input_index = mklnode_ptr_->input_start_index < 0 ? 0 : mklnode_ptr_->input_start_index;
if (mklnode_ptr_->parent_nodes.empty()) {
// Sub-graph's first node. Read input from input buffer
const OrtValue* input_tensor = ort.KernelContext_GetInput(context, input_index);
const T* src_data = const_cast<T*>(ort.GetTensorData<T>(input_tensor));
src_mem_->set_data_handle(static_cast<void*>(const_cast<T*>(src_data)));
}
if (mklnode_ptr_->output_index >= 0) {
auto& y_dims = primitive_dst_shape_.GetDims();
// Allocate memory for output bufffer
OrtValue* output = ort.KernelContext_GetOutput(context, mklnode_ptr_->output_index, &y_dims[0], static_cast<int>(primitive_dst_shape_.GetDims().size()));
T* dst_data = ort.GetTensorMutableData<T>(output);
if (!gpu_available_) {
if (primitive_dst_desc_ != ort_source_desc_) {
reorder_dst_mem_to_->set_data_handle(dst_data);
} else {
primitive_dst_mem_->set_data_handle(dst_data);
}
} else { // gpu_available_
reorder_dst_mem_to_->set_data_handle(dst_data);
}
}
return Status::OK();
}
private:
std::shared_ptr<dnnl::memory> src_mem_;
std::shared_ptr<dnnl::memory> src_mem_gpu_;
//std::unique_ptr<dnnl::reduction::desc> fwd_desc_;
//std::unique_ptr<dnnl::reduction::primitive_desc> reducemean_fwd_pd_;
//std::unique_ptr<dnnl::primitive> reducemean_fwd_;
std::unique_ptr<dnnl::memory::desc> src_md_;
int64_t keepdims_attr_;
std::vector<int64_t> axes_attr_;
bool gpu_available_;
};
} // namespace ort_dnnl
} // namespace onnxruntime

View file

@ -1030,7 +1030,7 @@ TEST(ReductionOpTest, ReduceMean) {
9.0f, 10.0f,
11.0f, 12.0f});
test.AddOutput<float>("reduced", {1, 2, 1}, {5.5f, 7.5f});
test.Run();
}