mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-01 23:30:35 +00:00
Added support for ReduceMean on DNNL EP for CPU and GPU (#7902)
* Added support for ReduceMean on DNNL EP for CPU and GPU
  Signed-off-by: Chethan Palangotu Keshava <chethan.palangotu.keshava@intel.com>
* Added fix for a resnet model failure where it was failing to create dst shape for reducemean when it was part of a subgraph with other ops
  Signed-off-by: Chethan Palangotu Keshava <chethan.palangotu.keshava@intel.com>
* Removing the DNNL EP from these unit tests. This is in anticipation of two changes:
  - DNNL EP unit tests would be added in a different location later on, so addition of EP individually to these tests will not be necessary
  - This was causing a memory leak fail in debug build. The bug is in the EP itself and not in the code added for reducemean. The fix for this is in the i/o handling overhaul which will be added later.
* Update reduction_ops_test.cc
  Had accidentally deleted a new line. Making sure there are no unnecessary changes in this file
This commit is contained in:
parent
352d560fd5
commit
3cd06cb38c
4 changed files with 271 additions and 15 deletions
|
|
@ -13,6 +13,7 @@ DnnlOpManager::DnnlOpManager() {
|
|||
dnnl_ops_map_.emplace(std::make_pair("LRN", std::unique_ptr<DnnlNodeCapability>(new DnnlDefaultNodeCapability())));
|
||||
dnnl_ops_map_.emplace(std::make_pair("MatMul", std::unique_ptr<DnnlNodeCapability>(new DnnlMatMulNodeCapability())));
|
||||
dnnl_ops_map_.emplace(std::make_pair("MaxPool", std::unique_ptr<DnnlNodeCapability>(new DnnlPoolNodeCapability())));
|
||||
dnnl_ops_map_.emplace(std::make_pair("ReduceMean", std::unique_ptr<DnnlNodeCapability>(new DnnlReduceMeanNodeCapability())));
|
||||
dnnl_ops_map_.emplace(std::make_pair("Relu", std::unique_ptr<DnnlNodeCapability>(new DnnlDefaultNodeCapability())));
|
||||
dnnl_ops_map_.emplace(std::make_pair("Sum", std::unique_ptr<DnnlNodeCapability>(new DnnlDefaultNodeCapability())));
|
||||
#if defined(ENABLE_TRAINING)
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@
|
|||
#include "core/providers/dnnl/subgraph/dnnl_pool.h"
|
||||
#include "core/providers/dnnl/subgraph/dnnl_sum.h"
|
||||
#include "core/providers/dnnl/subgraph/dnnl_lrn.h"
|
||||
#include "core/providers/dnnl/subgraph/dnnl_reducemean.h"
|
||||
#include "core/providers/dnnl/subgraph/dnnl_matmul.h"
|
||||
#ifdef ENABLE_TRAINING
|
||||
#include "core/providers/dnnl/subgraph/dnnl_convgrad.h"
|
||||
|
|
@ -108,6 +109,15 @@ class SubgraphPrimitive : public PrimitiveBase {
|
|||
kernel->parents_.push_back(context_.kernels[index]);
|
||||
}
|
||||
context_.kernels.push_back(kernel);
|
||||
} else if (dnnl_node.name == "ReduceMean") {
|
||||
std::ostringstream os;
|
||||
os << "ReduceMean-" << dnnl_node.node_index << "-";
|
||||
std::shared_ptr<DnnlReduceMean<T>> kernel;
|
||||
kernel = std::make_shared<DnnlReduceMean<T>>(dnnl_node, params.provider, *params.attributes, os.str());
|
||||
for (auto index : dnnl_node.parent_nodes) {
|
||||
kernel->parents_.push_back(context_.kernels[index]);
|
||||
}
|
||||
context_.kernels.push_back(kernel);
|
||||
} else if (dnnl_node.name == "BatchNormalization") {
|
||||
std::ostringstream os;
|
||||
os << "BatchNormalization-" << dnnl_node.node_index << "-";
|
||||
|
|
@ -340,24 +350,35 @@ template <typename T>
|
|||
Status DnnlFuncKernel<T>::Compute(const OrtCustomOpApi* api, OrtKernelContext* context) const {
|
||||
Status status;
|
||||
try {
|
||||
// The training runner sets up the training graph then calls it via the inferance runner using a new thread
|
||||
// each call. Since the SubgraphPrimitivePool stashes the nodes based on the thread_local memory it results in a new
|
||||
// stash being created per-call from the training loop. In theory the thread_local memory should be freed when the calling
|
||||
// thread is destroyed but this was not being seen when actually running the code. Instead of relying on the thread_local
|
||||
// memory being freed we name a new SubgraphPrimitive instead of using the SubgraphPrimitivePool when the code is built for
|
||||
// training. (If the training running is updated to use a thread pool instead of a new thread each run we may be able to
|
||||
// revert back to the SubgraphPrimitivePool.)
|
||||
// The training runner sets up the training graph then calls it via the inferance runner using a new thread
|
||||
// each call. Since the SubgraphPrimitivePool stashes the nodes based on the thread_local memory it results in a new
|
||||
// stash being created per-call from the training loop. In theory the thread_local memory should be freed when the calling
|
||||
// thread is destroyed but this was not being seen when actually running the code. Instead of relying on the thread_local
|
||||
// memory being freed we name a new SubgraphPrimitive instead of using the SubgraphPrimitivePool when the code is built for
|
||||
// training. (If the training running is updated to use a thread pool instead of a new thread each run we may be able to
|
||||
// revert back to the SubgraphPrimitivePool.)
|
||||
#ifdef ENABLE_TRAINING
|
||||
std::unique_ptr<SubgraphPrimitive<T>> primitive = std::make_unique<SubgraphPrimitive<T>>(api, context, params_);
|
||||
std::unique_ptr<SubgraphPrimitive<T>> primitive = std::make_unique<SubgraphPrimitive<T>>(api, context, params_);
|
||||
primitive->UpdateProvider(params_);
|
||||
status = primitive->Compute(api, context);
|
||||
#else
|
||||
SubgraphPrimitive<T>* primitive = SubgraphPrimitivePool<T>::Get(api, context, params_);
|
||||
#endif // ENABLE_TRAINING
|
||||
|
||||
primitive->UpdateProvider(params_);
|
||||
status = primitive->Compute(api, context);
|
||||
std::string subgraph_key = params_.subgraph_key;
|
||||
if (subgraph_key.find("ReduceMean") != std::string::npos) {
|
||||
std::unique_ptr<SubgraphPrimitive<T>> primitive = std::make_unique<SubgraphPrimitive<T>>(api, context, params_);
|
||||
primitive->UpdateProvider(params_);
|
||||
status = primitive->Compute(api, context);
|
||||
}
|
||||
else
|
||||
{
|
||||
SubgraphPrimitive<T>* primitive = SubgraphPrimitivePool<T>::Get(api, context, params_);
|
||||
primitive->UpdateProvider(params_);
|
||||
status = primitive->Compute(api, context);
|
||||
}
|
||||
#endif
|
||||
} catch (const dnnl::error& e) {
|
||||
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Status: ", e.status, ", message: ", e.what());
|
||||
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Status: ", e.status, ", message: ", e.what());
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
|
|
|
|||
234
onnxruntime/core/providers/dnnl/subgraph/dnnl_reducemean.h
Normal file
234
onnxruntime/core/providers/dnnl/subgraph/dnnl_reducemean.h
Normal file
|
|
@ -0,0 +1,234 @@
|
|||
// Copyright(C) 2019 Intel Corporation
|
||||
// Licensed under the MIT License
|
||||
|
||||
#pragma once
|
||||
#include "core/providers/dnnl/dnnl_fwd.h"
|
||||
#include "core/providers/dnnl/dnnl_execution_provider.h"
|
||||
#include "core/providers/dnnl/subgraph/dnnl_kernel.h"
|
||||
#include <vector>
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace ort_dnnl {
|
||||
|
||||
template <typename T>
|
||||
class DnnlReduceMean : public DnnlKernel {
|
||||
public:
|
||||
DnnlReduceMean(const DnnlNode& node,
|
||||
DNNLExecutionProvider* provider,
|
||||
const NodeAttributes& attributes,
|
||||
const std::string attributes_prefix = "") : DnnlKernel(node, provider) {
|
||||
ReadAttributes(attributes, attributes_prefix);
|
||||
ORT_UNUSED_PARAMETER(attributes);
|
||||
ORT_UNUSED_PARAMETER(attributes_prefix);
|
||||
}
|
||||
|
||||
void ReadAttributes(const NodeAttributes& attributes,
|
||||
const std::string attributes_prefix = "") override {
|
||||
auto attr = attributes.find(attributes_prefix + "keepdims");
|
||||
if (attr != attributes.end() &&
|
||||
attr->second().type() == ::ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT) {
|
||||
keepdims_attr_ = attr->second().i();
|
||||
}
|
||||
auto attr2 = attributes.find(attributes_prefix + "axes");
|
||||
if (attr2 != attributes.end()) {
|
||||
auto& proto = attr2->second();
|
||||
GetIntsAttr(proto, axes_attr_);
|
||||
}
|
||||
}
|
||||
|
||||
void CreatePrimitives(const OrtCustomOpApi* api,
|
||||
OrtKernelContext* context,
|
||||
const std::unordered_map<dnnl::engine::kind, dnnl::engine>& dnnl_engine,
|
||||
std::vector<dnnl::primitive>& net,
|
||||
std::vector<std::unordered_map<int, dnnl::memory>>& net_args) {
|
||||
dnnl::engine cpu_engine;
|
||||
dnnl::engine engine_to_use;
|
||||
std::unordered_map<dnnl::engine::kind, dnnl::engine>::const_iterator iter = dnnl_engine.find(dnnl::engine::kind::cpu);
|
||||
if (iter != dnnl_engine.end()) {
|
||||
cpu_engine = (dnnl::engine)iter->second;
|
||||
engine_to_use = cpu_engine;
|
||||
}
|
||||
gpu_available_ = false;
|
||||
dnnl::engine gpu_engine;
|
||||
iter = dnnl_engine.find(dnnl::engine::kind::gpu);
|
||||
if (iter != dnnl_engine.end()) {
|
||||
gpu_engine = (dnnl::engine)(iter->second);
|
||||
gpu_available_ = true;
|
||||
engine_to_use = gpu_engine;
|
||||
LOGS_DEFAULT(INFO) << "gpu engine found" << std::endl;
|
||||
}
|
||||
Ort::CustomOpApi ort{*api};
|
||||
int input_index = mklnode_ptr_->input_start_index < 0 ? 0 : mklnode_ptr_->input_start_index;
|
||||
|
||||
TensorShape x_shape;
|
||||
if (mklnode_ptr_->parent_nodes.empty()) {
|
||||
const OrtValue* input_tensor = ort.KernelContext_GetInput(context, input_index);
|
||||
auto tensor_info = ort.GetTensorTypeAndShape(input_tensor);
|
||||
auto tensor_shape = ort.GetTensorShape(tensor_info);
|
||||
ort.ReleaseTensorTypeAndShapeInfo(tensor_info);
|
||||
|
||||
auto xshape = tensor_shape.data();
|
||||
auto xdim = tensor_shape.size();
|
||||
|
||||
dnnl::memory::dims dims(xdim);
|
||||
ort_source_format_ = GetSourceFormat(static_cast<int>(xdim));
|
||||
x_shape = TensorShape(xshape, xdim);
|
||||
|
||||
if (x_shape.NumDimensions() == 0) {
|
||||
primitive_created_status_ = Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Shape of size zero " + x_shape.ToString());
|
||||
return;
|
||||
}
|
||||
|
||||
dnnl::memory::dims src_dims(
|
||||
x_shape.GetDims().begin(), x_shape.GetDims().end());
|
||||
|
||||
ort_source_desc_ = dnnl::memory::desc(
|
||||
{src_dims}, DnnnType<T>(), ort_source_format_);
|
||||
source_desc_ = ort_source_desc_;
|
||||
src_md_ = std::make_unique<dnnl::memory::desc>(
|
||||
dnnl::memory::desc({src_dims}, DnnnType<T>(), ort_source_format_));
|
||||
src_mem_ = std::make_unique<dnnl::memory>(
|
||||
dnnl::memory({{src_dims}, DnnnType<T>(), ort_source_format_}, cpu_engine, nullptr));
|
||||
if (gpu_available_) {
|
||||
src_mem_gpu_ = std::make_unique<dnnl::memory>(*src_md_, gpu_engine);
|
||||
net.push_back(mkldnn::reorder(*src_mem_, *src_mem_gpu_));
|
||||
net_args.push_back({{MKLDNN_ARG_SRC, *src_mem_},
|
||||
{MKLDNN_ARG_DST, *src_mem_gpu_}});
|
||||
}
|
||||
} else {
|
||||
src_md_ = std::make_unique<dnnl::memory::desc>(
|
||||
dnnl::memory::desc(parents_[0].get()->primitive_dst_desc_));
|
||||
if (!gpu_available_) {
|
||||
src_mem_ = parents_[0].get()->primitive_dst_mem_;
|
||||
} else { // gpu_available_
|
||||
src_mem_gpu_ = parents_[0].get()->primitive_dst_mem_;
|
||||
}
|
||||
x_shape = parents_[0].get()->primitive_dst_shape_;
|
||||
ort_source_format_ = parents_[0].get()->ort_source_format_;
|
||||
ort_source_desc_ = parents_[0].get()->ort_source_desc_;
|
||||
source_desc_ = parents_[0].get()->primitive_dst_desc_;
|
||||
}
|
||||
//We need to calculate output tensor shape
|
||||
//First we initialize it with input shape and then we modify it based on the attribute values
|
||||
//This is because the attribute values decide the output shape
|
||||
|
||||
auto xshape = x_shape.GetDims();
|
||||
auto ndim = x_shape.NumDimensions();
|
||||
for (unsigned long int i = 0; i < ndim; i++) {
|
||||
if (axes_attr_.size() == 0)
|
||||
xshape[i] = 1; //If no axis is specified, then output shape is just all 1's
|
||||
else if (i < axes_attr_.size()) {
|
||||
if (axes_attr_[i] < 0)
|
||||
xshape[ndim + axes_attr_[i]] = 1;
|
||||
else
|
||||
xshape[axes_attr_[i]] = 1;
|
||||
} //If there is axis, then make the respective dimensions 1, keeping the other dimension values untouched.
|
||||
}
|
||||
primitive_dst_shape_ = TensorShape(xshape.data(), ndim);
|
||||
|
||||
dnnl::memory::dims dst_dims_mkl(primitive_dst_shape_.GetDims().begin(), primitive_dst_shape_.GetDims().end());
|
||||
|
||||
primitive_dst_md_ = std::make_unique<dnnl::memory::desc>(
|
||||
dnnl::memory::desc({ dst_dims_mkl }, DnnnType<T>(), dnnl::memory::format_tag::any));
|
||||
|
||||
// Create operation descriptor.
|
||||
std::unique_ptr<dnnl::reduction::desc> fwd_desc_ = std::make_unique<dnnl::reduction::desc>(dnnl::reduction::desc(
|
||||
dnnl::algorithm::reduction_mean, *src_md_, *primitive_dst_md_, 0.f, 0.f));
|
||||
// Create primitive descriptor.
|
||||
std::unique_ptr<dnnl::reduction::primitive_desc> reducemean_fwd_pd_ = std::make_unique<dnnl::reduction::primitive_desc>(dnnl::reduction::primitive_desc(*fwd_desc_, engine_to_use));
|
||||
// Create the primitive.
|
||||
std::unique_ptr<dnnl::primitive> reducemean_fwd_ = std::make_unique<dnnl::reduction>(dnnl::reduction(*reducemean_fwd_pd_));
|
||||
|
||||
primitive_src_desc_ = reducemean_fwd_pd_.get()->src_desc();
|
||||
primitive_dst_desc_ = reducemean_fwd_pd_.get()->dst_desc();
|
||||
|
||||
if (!gpu_available_) {
|
||||
if (mklnode_ptr_->output_index >= 0) {
|
||||
// last node of sub-graph. need to allocate memory for output_tensor
|
||||
if (primitive_dst_desc_ != ort_source_desc_) {
|
||||
// reorder neded. Use primitive output as input to reorder and
|
||||
// allocate buffer for reorder output, final output of this subgraph
|
||||
primitive_dst_mem_ = std::make_shared<dnnl::memory>(dnnl::memory(reducemean_fwd_pd_.get()->dst_desc(), cpu_engine));
|
||||
} else {
|
||||
// Last node but re-order not needed. Allocate buffer to output of this node
|
||||
primitive_dst_mem_ = std::make_shared<dnnl::memory>(dnnl::memory(reducemean_fwd_pd_.get()->dst_desc(), cpu_engine, nullptr));
|
||||
}
|
||||
} else {
|
||||
// Intermediate node. Use dnnl kernel internal memory for output and
|
||||
// use this as input to next node.
|
||||
primitive_dst_mem_ = std::make_shared<dnnl::memory>(dnnl::memory(reducemean_fwd_pd_.get()->dst_desc(), cpu_engine));
|
||||
}
|
||||
} else { // gpu_available_
|
||||
primitive_dst_mem_ = std::make_shared<dnnl::memory>(dnnl::memory(reducemean_fwd_pd_.get()->dst_desc(), gpu_engine));
|
||||
}
|
||||
|
||||
|
||||
if (!gpu_available_) {
|
||||
net.push_back(*reducemean_fwd_);
|
||||
net_args.push_back({{DNNL_ARG_SRC, *src_mem_},
|
||||
{DNNL_ARG_DST, *primitive_dst_mem_}});
|
||||
} else { // gpu_available_
|
||||
net.push_back(*reducemean_fwd_);
|
||||
net_args.push_back({{DNNL_ARG_SRC, *src_mem_gpu_},
|
||||
{DNNL_ARG_DST, *primitive_dst_mem_}});
|
||||
}
|
||||
|
||||
if (mklnode_ptr_->output_index >= 0) {
|
||||
// one of the end nodes. Allocate output buffer memory and
|
||||
// reorder is necessary
|
||||
dnnl::memory::data_type t = DnnnType<T>();
|
||||
InitDstReorderOutput(cpu_engine, t, net, net_args, gpu_available_);
|
||||
}
|
||||
}
|
||||
|
||||
Status Bind(const OrtCustomOpApi* api, OrtKernelContext* context) override {
|
||||
Ort::CustomOpApi ort{*api};
|
||||
|
||||
ORT_RETURN_IF_ERROR(primitive_created_status_);
|
||||
|
||||
int input_index = mklnode_ptr_->input_start_index < 0 ? 0 : mklnode_ptr_->input_start_index;
|
||||
|
||||
if (mklnode_ptr_->parent_nodes.empty()) {
|
||||
// Sub-graph's first node. Read input from input buffer
|
||||
const OrtValue* input_tensor = ort.KernelContext_GetInput(context, input_index);
|
||||
const T* src_data = const_cast<T*>(ort.GetTensorData<T>(input_tensor));
|
||||
src_mem_->set_data_handle(static_cast<void*>(const_cast<T*>(src_data)));
|
||||
}
|
||||
|
||||
if (mklnode_ptr_->output_index >= 0) {
|
||||
auto& y_dims = primitive_dst_shape_.GetDims();
|
||||
// Allocate memory for output bufffer
|
||||
OrtValue* output = ort.KernelContext_GetOutput(context, mklnode_ptr_->output_index, &y_dims[0], static_cast<int>(primitive_dst_shape_.GetDims().size()));
|
||||
T* dst_data = ort.GetTensorMutableData<T>(output);
|
||||
|
||||
if (!gpu_available_) {
|
||||
if (primitive_dst_desc_ != ort_source_desc_) {
|
||||
reorder_dst_mem_to_->set_data_handle(dst_data);
|
||||
} else {
|
||||
primitive_dst_mem_->set_data_handle(dst_data);
|
||||
}
|
||||
} else { // gpu_available_
|
||||
reorder_dst_mem_to_->set_data_handle(dst_data);
|
||||
}
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
private:
|
||||
std::shared_ptr<dnnl::memory> src_mem_;
|
||||
std::shared_ptr<dnnl::memory> src_mem_gpu_;
|
||||
|
||||
//std::unique_ptr<dnnl::reduction::desc> fwd_desc_;
|
||||
//std::unique_ptr<dnnl::reduction::primitive_desc> reducemean_fwd_pd_;
|
||||
//std::unique_ptr<dnnl::primitive> reducemean_fwd_;
|
||||
|
||||
std::unique_ptr<dnnl::memory::desc> src_md_;
|
||||
|
||||
int64_t keepdims_attr_;
|
||||
std::vector<int64_t> axes_attr_;
|
||||
|
||||
bool gpu_available_;
|
||||
};
|
||||
} // namespace ort_dnnl
|
||||
} // namespace onnxruntime
|
||||
|
|
@ -1030,7 +1030,7 @@ TEST(ReductionOpTest, ReduceMean) {
|
|||
9.0f, 10.0f,
|
||||
11.0f, 12.0f});
|
||||
test.AddOutput<float>("reduced", {1, 2, 1}, {5.5f, 7.5f});
|
||||
|
||||
|
||||
test.Run();
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue