From 3cd06cb38cd5e76619dbd7a2859897ab9cdd2d8c Mon Sep 17 00:00:00 2001
From: chethanpk <63478277+chethanpk@users.noreply.github.com>
Date: Mon, 21 Jun 2021 17:15:46 -0700
Subject: [PATCH] Added support for ReduceMean on DNNL EP for CPU and GPU
 (#7902)

* Added support for ReduceMean on DNNL EP for CPU and GPU

Signed-off-by: Chethan Palangotu Keshava <chethan.palangotu.keshava@intel.com>

* Added a fix for a ResNet model failure: the dst shape for ReduceMean could not be created when the node was part of a subgraph with other ops

Signed-off-by: Chethan Palangotu Keshava <chethan.palangotu.keshava@intel.com>

* Removing the DNNL EP from these unit tests. This is in anticipation of two changes:
- DNNL EP unit tests will be added in a different location later on, so adding the EP individually to these tests will not be necessary
- This was causing a memory leak failure in the debug build. The bug is in the EP itself and not in the code added for ReduceMean. The fix for this is in the I/O handling overhaul, which will be added later.

* Update reduction_ops_test.cc

Had accidentally deleted a new line. Making sure there are no unnecessary changes in this file
---
 .../core/providers/dnnl/dnnl_op_manager.cc    |   1 +
 .../dnnl/subgraph/dnnl_func_kernel.cc         |  49 ++--
 .../providers/dnnl/subgraph/dnnl_reducemean.h | 234 ++++++++++++++++++
 .../cpu/reduction/reduction_ops_test.cc       |   2 +-
 4 files changed, 271 insertions(+), 15 deletions(-)
 create mode 100644 onnxruntime/core/providers/dnnl/subgraph/dnnl_reducemean.h
diff --git a/onnxruntime/core/providers/dnnl/dnnl_op_manager.cc b/onnxruntime/core/providers/dnnl/dnnl_op_manager.cc
index 0aac1c8c86..24bb06a69b 100644
--- a/onnxruntime/core/providers/dnnl/dnnl_op_manager.cc
+++ b/onnxruntime/core/providers/dnnl/dnnl_op_manager.cc
@@ -13,6 +13,7 @@ DnnlOpManager::DnnlOpManager() {
   dnnl_ops_map_.emplace(std::make_pair("LRN", std::unique_ptr<DnnlNodeCapability>(new DnnlDefaultNodeCapability())));
   dnnl_ops_map_.emplace(std::make_pair("MatMul", std::unique_ptr<DnnlNodeCapability>(new DnnlMatMulNodeCapability())));
   dnnl_ops_map_.emplace(std::make_pair("MaxPool", std::unique_ptr<DnnlNodeCapability>(new DnnlPoolNodeCapability())));
+  dnnl_ops_map_.emplace(std::make_pair("ReduceMean", std::unique_ptr<DnnlNodeCapability>(new DnnlReduceMeanNodeCapability())));
   dnnl_ops_map_.emplace(std::make_pair("Relu", std::unique_ptr<DnnlNodeCapability>(new DnnlDefaultNodeCapability())));
   dnnl_ops_map_.emplace(std::make_pair("Sum", std::unique_ptr<DnnlNodeCapability>(new DnnlDefaultNodeCapability())));
 #if defined(ENABLE_TRAINING)
diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_func_kernel.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_func_kernel.cc
index 5e330725db..f6c4da8e3a 100644
--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_func_kernel.cc
+++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_func_kernel.cc
@@ -15,6 +15,7 @@
 #include "core/providers/dnnl/subgraph/dnnl_pool.h"
 #include "core/providers/dnnl/subgraph/dnnl_sum.h"
 #include "core/providers/dnnl/subgraph/dnnl_lrn.h"
+#include "core/providers/dnnl/subgraph/dnnl_reducemean.h"
 #include "core/providers/dnnl/subgraph/dnnl_matmul.h"
 #ifdef ENABLE_TRAINING
 #include "core/providers/dnnl/subgraph/dnnl_convgrad.h"
@@ -108,6 +109,15 @@ class SubgraphPrimitive : public PrimitiveBase {
           kernel->parents_.push_back(context_.kernels[index]);
         }
         context_.kernels.push_back(kernel);
+      } else if (dnnl_node.name == "ReduceMean") {
+        std::ostringstream os;
+        os << "ReduceMean-" << dnnl_node.node_index << "-";
+        std::shared_ptr<DnnlReduceMean<T>> kernel;
+        kernel = std::make_shared<DnnlReduceMean<T>>(dnnl_node, params.provider, *params.attributes, os.str());
+        for (auto index : dnnl_node.parent_nodes) {
+            kernel->parents_.push_back(context_.kernels[index]);
+        }
+        context_.kernels.push_back(kernel);
       } else if (dnnl_node.name == "BatchNormalization") {
         std::ostringstream os;
         os << "BatchNormalization-" << dnnl_node.node_index << "-";
@@ -340,24 +350,35 @@ template <typename T>
 Status DnnlFuncKernel<T>::Compute(const OrtCustomOpApi* api, OrtKernelContext* context) const {
   Status status;
   try {
-    // The training runner sets up the training graph then calls it via the inferance runner using a new thread
-    // each call. Since the SubgraphPrimitivePool stashes the nodes based on the thread_local memory it results in a new
-    // stash being created per-call from the training loop.  In theory the thread_local memory should be freed when the calling
-    // thread is destroyed but this was not being seen when actually running the code.  Instead of relying on the thread_local
-    // memory being freed we name a new SubgraphPrimitive instead of using the SubgraphPrimitivePool when the code is built for
-    // training. (If the training running is updated to use a thread pool instead of a new thread each run we may be able to
-    // revert back to the SubgraphPrimitivePool.)
+      // The training runner sets up the training graph then calls it via the inference runner using a new thread
+      // each call. Since the SubgraphPrimitivePool stashes the nodes based on the thread_local memory it results in a new
+      // stash being created per-call from the training loop.  In theory the thread_local memory should be freed when the calling
+      // thread is destroyed but this was not being seen when actually running the code.  Instead of relying on the thread_local
+      // memory being freed we create a new SubgraphPrimitive instead of using the SubgraphPrimitivePool when the code is built for
+      // training. (If the training runner is updated to use a thread pool instead of a new thread each run we may be able to
+      // revert back to the SubgraphPrimitivePool.)
 #ifdef ENABLE_TRAINING
-    std::unique_ptr<SubgraphPrimitive<T>> primitive = std::make_unique<SubgraphPrimitive<T>>(api, context, params_);
+      std::unique_ptr<SubgraphPrimitive<T>> primitive = std::make_unique<SubgraphPrimitive<T>>(api, context, params_);
+      primitive->UpdateProvider(params_);
+      status = primitive->Compute(api, context);
 #else
-    SubgraphPrimitive<T>* primitive = SubgraphPrimitivePool<T>::Get(api, context, params_);
-#endif  // ENABLE_TRAINING
-
-    primitive->UpdateProvider(params_);
-    status = primitive->Compute(api, context);
+      const std::string& subgraph_key = params_.subgraph_key;  // bound by reference: no per-call copy of the key
+      if (subgraph_key.find("ReduceMean") != std::string::npos) {  // subgraphs containing ReduceMean bypass the pool and get a fresh primitive per call
+          std::unique_ptr<SubgraphPrimitive<T>> primitive = std::make_unique<SubgraphPrimitive<T>>(api, context, params_);
+          primitive->UpdateProvider(params_);
+          status = primitive->Compute(api, context);
+      }
+      else
+      {
+          SubgraphPrimitive<T>* primitive = SubgraphPrimitivePool<T>::Get(api, context, params_);  // every other subgraph reuses the pooled primitive
+          primitive->UpdateProvider(params_);
+          status = primitive->Compute(api, context);
+      }
+#endif  // ENABLE_TRAINING
   } catch (const dnnl::error& e) {
-    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Status: ", e.status, ", message: ", e.what());
+      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Status: ", e.status, ", message: ", e.what());
   }
+
   return status;
 }
 
diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_reducemean.h b/onnxruntime/core/providers/dnnl/subgraph/dnnl_reducemean.h
new file mode 100644
index 0000000000..d3026c38d9
--- /dev/null
+++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_reducemean.h
@@ -0,0 +1,234 @@
+// Copyright(C) 2019 Intel Corporation
+// Licensed under the MIT License
+
+#pragma once
+#include "core/providers/dnnl/dnnl_fwd.h"
+#include "core/providers/dnnl/dnnl_execution_provider.h"
+#include "core/providers/dnnl/subgraph/dnnl_kernel.h"
+#include <vector>
+
+namespace onnxruntime {
+namespace ort_dnnl {
+
+template <typename T>
+class DnnlReduceMean : public DnnlKernel {  // ONNX ReduceMean mapped onto the dnnl::reduction primitive (reduction_mean)
+ public:
+  DnnlReduceMean(const DnnlNode& node,
+                 DNNLExecutionProvider* provider,
+                 const NodeAttributes& attributes,
+                 const std::string attributes_prefix = "") : DnnlKernel(node, provider) {
+    ReadAttributes(attributes, attributes_prefix);
+    // SubgraphPrimitive passes "ReduceMean-<node_index>-" as the prefix, so the lookups
+    // done by ReadAttributes resolve this node's own "keepdims" and "axes" entries.
+  }
+  // Caches the prefix-qualified "keepdims" and "axes" attributes; absent attributes leave the members untouched.
+  void ReadAttributes(const NodeAttributes& attributes,
+                      const std::string attributes_prefix = "") override {
+    auto keepdims_it = attributes.find(attributes_prefix + "keepdims");
+    if (keepdims_it != attributes.end() &&
+        keepdims_it->second().type() == ::ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT) {
+      keepdims_attr_ = keepdims_it->second().i();
+    }
+    auto axes_it = attributes.find(attributes_prefix + "axes");
+    if (axes_it != attributes.end()) {
+      auto& proto = axes_it->second();
+      GetIntsAttr(proto, axes_attr_);
+    }
+  }
+  // Builds the reduction_mean primitive plus its source/destination memories and appends them to net/net_args.
+  void CreatePrimitives(const OrtCustomOpApi* api,
+                        OrtKernelContext* context,
+                        const std::unordered_map<dnnl::engine::kind, dnnl::engine>& dnnl_engine,
+                        std::vector<dnnl::primitive>& net,
+                        std::vector<std::unordered_map<int, dnnl::memory>>& net_args) {
+    dnnl::engine cpu_engine;
+    dnnl::engine engine_to_use;
+    auto iter = dnnl_engine.find(dnnl::engine::kind::cpu);
+    if (iter != dnnl_engine.end()) {
+      cpu_engine = iter->second;
+      engine_to_use = cpu_engine;
+    }
+    gpu_available_ = false;
+    dnnl::engine gpu_engine;
+    iter = dnnl_engine.find(dnnl::engine::kind::gpu);
+    if (iter != dnnl_engine.end()) {
+      gpu_engine = iter->second;
+      gpu_available_ = true;
+      engine_to_use = gpu_engine;  // a GPU engine, when present, takes precedence over the CPU engine
+      LOGS_DEFAULT(INFO) << "gpu engine found";
+    }
+    Ort::CustomOpApi ort{*api};
+    int input_index = mklnode_ptr_->input_start_index < 0 ? 0 : mklnode_ptr_->input_start_index;
+
+    TensorShape x_shape;
+    if (mklnode_ptr_->parent_nodes.empty()) {
+      const OrtValue* input_tensor = ort.KernelContext_GetInput(context, input_index);
+      auto tensor_info = ort.GetTensorTypeAndShape(input_tensor);
+      auto tensor_shape = ort.GetTensorShape(tensor_info);
+      ort.ReleaseTensorTypeAndShapeInfo(tensor_info);
+
+      auto xshape = tensor_shape.data();
+      auto xdim = tensor_shape.size();
+
+      // First node of the subgraph: shape and source format come straight from the ORT input tensor.
+      ort_source_format_ = GetSourceFormat(static_cast<int>(xdim));
+      x_shape = TensorShape(xshape, xdim);
+
+      if (x_shape.NumDimensions() == 0) {
+        primitive_created_status_ = Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Shape of size zero " + x_shape.ToString());
+        return;
+      }
+
+      dnnl::memory::dims src_dims(
+          x_shape.GetDims().begin(), x_shape.GetDims().end());
+
+      ort_source_desc_ = dnnl::memory::desc(
+          {src_dims}, DnnnType<T>(), ort_source_format_);
+      source_desc_ = ort_source_desc_;
+      src_md_ = std::make_unique<dnnl::memory::desc>(
+          dnnl::memory::desc({src_dims}, DnnnType<T>(), ort_source_format_));
+      src_mem_ = std::make_shared<dnnl::memory>(  // null handle here; Bind() attaches the real input buffer
+          dnnl::memory({{src_dims}, DnnnType<T>(), ort_source_format_}, cpu_engine, nullptr));
+      if (gpu_available_) {  // stage the CPU input into GPU memory ahead of the reduction
+        src_mem_gpu_ = std::make_unique<dnnl::memory>(*src_md_, gpu_engine);
+        net.push_back(dnnl::reorder(*src_mem_, *src_mem_gpu_));
+        net_args.push_back({{DNNL_ARG_SRC, *src_mem_},
+                            {DNNL_ARG_DST, *src_mem_gpu_}});
+      }
+    } else {
+      src_md_ = std::make_unique<dnnl::memory::desc>(
+          dnnl::memory::desc(parents_[0]->primitive_dst_desc_));
+      if (!gpu_available_) {
+        src_mem_ = parents_[0]->primitive_dst_mem_;
+      } else { // gpu_available_
+        src_mem_gpu_ = parents_[0]->primitive_dst_mem_;
+      }
+      x_shape = parents_[0]->primitive_dst_shape_;
+      ort_source_format_ = parents_[0]->ort_source_format_;
+      ort_source_desc_ = parents_[0]->ort_source_desc_;
+      source_desc_ = parents_[0]->primitive_dst_desc_;
+    }
+    // Derive the output shape from the input shape: the destination keeps the input's rank,
+    // every reduced dimension becomes 1 and the remaining dimensions are left untouched.
+    // The axes attribute decides which dimensions are reduced.
+
+    auto xshape = x_shape.GetDims();
+    auto ndim = x_shape.NumDimensions();
+    if (axes_attr_.empty()) {
+      // No axes specified: every dimension is reduced, so the output shape is all 1's.
+      for (auto& dim : xshape) dim = 1;
+    }
+    // At most ndim axes are consumed, matching the number of dimensions available.
+    for (size_t i = 0; i < axes_attr_.size() && i < ndim; ++i) {
+      // A negative axis counts back from the last dimension.
+      const int64_t axis = axes_attr_[i] < 0 ? axes_attr_[i] + static_cast<int64_t>(ndim) : axes_attr_[i];
+      xshape[static_cast<size_t>(axis)] = 1;
+    }
+    primitive_dst_shape_ = TensorShape(xshape.data(), ndim);
+
+    dnnl::memory::dims dst_dims_mkl(primitive_dst_shape_.GetDims().begin(), primitive_dst_shape_.GetDims().end());
+
+    primitive_dst_md_ = std::make_unique<dnnl::memory::desc>(
+        dnnl::memory::desc({ dst_dims_mkl }, DnnnType<T>(), dnnl::memory::format_tag::any));
+
+    // Create operation descriptor.
+    dnnl::reduction::desc fwd_desc(
+        dnnl::algorithm::reduction_mean, *src_md_, *primitive_dst_md_, 0.f, 0.f);
+    // Create primitive descriptor.
+    dnnl::reduction::primitive_desc reducemean_fwd_pd(fwd_desc, engine_to_use);
+    // Create the primitive.
+    dnnl::reduction reducemean_fwd(reducemean_fwd_pd);
+
+    primitive_src_desc_ = reducemean_fwd_pd.src_desc();
+    primitive_dst_desc_ = reducemean_fwd_pd.dst_desc();
+
+    if (!gpu_available_) {
+      if (mklnode_ptr_->output_index >= 0) {
+        // last node of sub-graph. need to allocate memory for output_tensor
+        if (primitive_dst_desc_ != ort_source_desc_) {
+          // reorder needed. Use primitive output as input to reorder and
+          // allocate buffer for reorder output, final output of this subgraph
+          primitive_dst_mem_ = std::make_shared<dnnl::memory>(dnnl::memory(reducemean_fwd_pd.dst_desc(), cpu_engine));
+        } else {
+          // Last node but re-order not needed. Allocate buffer to output of this node
+          primitive_dst_mem_ = std::make_shared<dnnl::memory>(dnnl::memory(reducemean_fwd_pd.dst_desc(), cpu_engine, nullptr));
+        }
+      } else {
+        // Intermediate node. Use dnnl kernel internal memory for output and
+        // use this as input to next node.
+        primitive_dst_mem_ = std::make_shared<dnnl::memory>(dnnl::memory(reducemean_fwd_pd.dst_desc(), cpu_engine));
+      }
+    } else { // gpu_available_
+      primitive_dst_mem_ = std::make_shared<dnnl::memory>(dnnl::memory(reducemean_fwd_pd.dst_desc(), gpu_engine));
+    }
+
+    // Queue the reduction; only the source memory differs between the CPU and GPU paths.
+    if (!gpu_available_) {
+      net.push_back(reducemean_fwd);
+      net_args.push_back({{DNNL_ARG_SRC, *src_mem_},
+                          {DNNL_ARG_DST, *primitive_dst_mem_}});
+    } else { // gpu_available_
+      net.push_back(reducemean_fwd);
+      net_args.push_back({{DNNL_ARG_SRC, *src_mem_gpu_},
+                          {DNNL_ARG_DST, *primitive_dst_mem_}});
+    }
+
+    if (mklnode_ptr_->output_index >= 0) {
+      // one of the end nodes. Allocate output buffer memory and
+      // reorder is necessary
+      dnnl::memory::data_type t = DnnnType<T>();
+      InitDstReorderOutput(cpu_engine, t, net, net_args, gpu_available_);
+    }
+  }
+  // Binds this run's ORT input/output buffers to the DNNL memory objects created by CreatePrimitives.
+  Status Bind(const OrtCustomOpApi* api, OrtKernelContext* context) override {
+    Ort::CustomOpApi ort{*api};
+
+    ORT_RETURN_IF_ERROR(primitive_created_status_);
+
+    int input_index = mklnode_ptr_->input_start_index < 0 ? 0 : mklnode_ptr_->input_start_index;
+
+    if (mklnode_ptr_->parent_nodes.empty()) {
+      // Sub-graph's first node. Read input from input buffer
+      const OrtValue* input_tensor = ort.KernelContext_GetInput(context, input_index);
+      const T* src_data = ort.GetTensorData<T>(input_tensor);
+      src_mem_->set_data_handle(const_cast<T*>(src_data));  // set_data_handle takes a non-const pointer
+    }
+
+    if (mklnode_ptr_->output_index >= 0) {
+      auto& y_dims = primitive_dst_shape_.GetDims();
+      // Allocate memory for output buffer
+      OrtValue* output = ort.KernelContext_GetOutput(context, mklnode_ptr_->output_index, y_dims.data(), static_cast<int>(y_dims.size()));
+      T* dst_data = ort.GetTensorMutableData<T>(output);
+
+      // Point the final destination at the ORT output buffer. On GPU, or when the primitive's
+      // dst descriptor differs from ort_source_desc_, that destination is reorder_dst_mem_to_
+      // (presumably set up by InitDstReorderOutput - confirm); otherwise the primitive writes
+      // straight into the output through primitive_dst_mem_.
+      if (gpu_available_ || primitive_dst_desc_ != ort_source_desc_) {
+        reorder_dst_mem_to_->set_data_handle(dst_data);
+      } else {
+        primitive_dst_mem_->set_data_handle(dst_data);
+      }
+    }
+
+    return Status::OK();
+  }
+
+ private:
+  std::shared_ptr<dnnl::memory> src_mem_;      // CPU-side source: bound to the ORT input, or the parent's dst memory
+  std::shared_ptr<dnnl::memory> src_mem_gpu_;  // GPU-side source, set only when a GPU engine is available
+
+  // The reduction descriptor, primitive descriptor and primitive are locals of
+  // CreatePrimitives: the primitive is copied into the net there, so nothing
+  // needs to be kept as a member.
+
+  std::unique_ptr<dnnl::memory::desc> src_md_;
+
+  int64_t keepdims_attr_ = 1;  // ONNX default; NOTE(review): read but never consulted when shaping the output - confirm keepdims=0 is rejected upstream
+  std::vector<int64_t> axes_attr_;
+
+  bool gpu_available_ = false;
+};
+}  // namespace ort_dnnl
+}  // namespace onnxruntime
diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
index 01b0671261..ed896269fd 100644
--- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
+++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
@@ -1030,7 +1030,7 @@ TEST(ReductionOpTest, ReduceMean) {
                         9.0f, 10.0f,
                         11.0f, 12.0f});
   test.AddOutput<float>("reduced", {1, 2, 1}, {5.5f, 7.5f});
-
+  
   test.Run();
 }