From f3c74ec3e9ee7e80daa3ccc7e95c3249844e7e41 Mon Sep 17 00:00:00 2001
From: Sreekanth Yalachigere
 <17345104+sreekanth-yalachigere@users.noreply.github.com>
Date: Thu, 18 Jul 2019 22:57:00 -0700
Subject: [PATCH] Reduce memory footprint of MKL-DNN EP (#1429)

* MKL-DNN EP memory fix patch

* Call default provider for Opset10

* opset 10 fix

* removed email header from patch

* UseSubgraph method refactored
---
 cmake/external/mkldnn.cmake                   |   2 +
 cmake/patches/mkldnn/mem-patch.cmake.patch    | 107 ++++++++++++++++++
 .../mkldnn/mkldnn_execution_provider.cc       |  36 ++++--
 .../mkldnn/mkldnn_execution_provider.h        |   3 +-
 .../providers/mkldnn/subgraph/mkldnn_conv.h   |   4 +-
 5 files changed, 139 insertions(+), 13 deletions(-)
 create mode 100644 cmake/patches/mkldnn/mem-patch.cmake.patch

diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index ed05d885a6..796541f096 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -61,6 +61,7 @@ if (onnxruntime_USE_MKLDNN)
   set(MKLDNN_INCLUDE_DIR ${MKLDNN_INSTALL}/include)
   if(NOT onnxruntime_BUILD_FOR_NATIVE_MACHINE)
     set(MKLDNN_PATCH_COMMAND1 git apply ${CMAKE_SOURCE_DIR}/patches/mkldnn/platform.cmake.patch)
+    set(MKLDNN_PATCH_COMMAND2 git apply ${CMAKE_SOURCE_DIR}/patches/mkldnn/mem-patch.cmake.patch)
     # discard prior changes due to patching in mkldnn source to unblock incremental builds.
     set(MKLDNN_PATCH_DISCARD_COMMAND cd ${MKLDNN_SOURCE} && git checkout -- .)
   endif()
@@ -69,6 +70,7 @@ if (onnxruntime_USE_MKLDNN)
     GIT_REPOSITORY ${MKLDNN_URL}
     GIT_TAG ${MKLDNN_TAG}
     PATCH_COMMAND ${MKLDNN_PATCH_DISCARD_COMMAND} COMMAND ${MKLDNN_PATCH_COMMAND1}
+    COMMAND ${MKLDNN_PATCH_COMMAND2}
     SOURCE_DIR ${MKLDNN_SOURCE}
     CMAKE_ARGS -DMKLDNN_PRODUCT_BUILD_MODE=OFF -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL} -DMKLROOT=${MKML_DIR}
   )
diff --git a/cmake/patches/mkldnn/mem-patch.cmake.patch b/cmake/patches/mkldnn/mem-patch.cmake.patch
new file mode 100644
index 0000000000..9d0a2543e6
--- /dev/null
+++ b/cmake/patches/mkldnn/mem-patch.cmake.patch
@@ -0,0 +1,107 @@
+
+---
+ src/cpu/jit_avx2_1x1_convolution.cpp                 | 6 +++---
+ src/cpu/jit_avx512_common_1x1_convolution.cpp        | 9 ++++-----
+ src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp | 6 ++++--
+ src/cpu/jit_uni_1x1_conv_utils.hpp                   | 3 ++-
+ 4 files changed, 13 insertions(+), 11 deletions(-)
+
+diff --git a/src/cpu/jit_avx2_1x1_convolution.cpp b/src/cpu/jit_avx2_1x1_convolution.cpp
+index 46362886..edb2b6fb 100644
+--- a/src/cpu/jit_avx2_1x1_convolution.cpp
++++ b/src/cpu/jit_avx2_1x1_convolution.cpp
+@@ -50,7 +50,7 @@ void jit_avx2_1x1_convolution_fwd_t::execute_forward() const {
+     const memory_desc_wrapper weights_d(pd()->weights_pd(0));
+ 
+     const auto &jcp = kernel_->jcp;
+-    auto rtus_space = scratchpad().get<data_t>(key_conv_rtus_space);
++    auto rtus_space = pd()->rtus_.reduce_src_?scratchpad().get<data_t>(key_conv_rtus_space):NULL;
+ 
+     const int work_amount = jcp.mb * jcp.ngroups * jcp.nb_bcast;
+     const int ndims = dst_d.ndims();
+@@ -180,7 +180,7 @@ void jit_avx2_1x1_convolution_bwd_data_t::execute_backward_data() const {
+     const memory_desc_wrapper diff_src_d(pd()->diff_src_pd());
+ 
+     const auto &jcp = kernel_->jcp;
+-    auto rtus_space = scratchpad().get<data_t>(key_conv_rtus_space);
++    auto rtus_space = pd()->rtus_.reduce_src_?scratchpad().get<data_t>(key_conv_rtus_space):NULL;
+ 
+     // TODO (Roma): remove this restriction
+     assert(jcp.stride_w == 1 && jcp.stride_h == 1);
+@@ -306,7 +306,7 @@ void jit_avx2_1x1_convolution_bwd_weights_t::execute_backward_weights() const {
+     const memory_desc_wrapper diff_bias_d(pd()->diff_weights_pd(1));
+ 
+     const auto &jcp = kernel_->jcp;
+-    auto rtus_space = scratchpad.get<data_t>(key_conv_rtus_space);
++    auto rtus_space = pd()->rtus_.reduce_src_?scratchpad.get<data_t>(key_conv_rtus_space):NULL;
+ 
+     data_t *diff_bias = pd()->wants_padded_bias()
+         ? scratchpad.get<data_t>(key_conv_padded_bias) : diff_bias_in;
+diff --git a/src/cpu/jit_avx512_common_1x1_convolution.cpp b/src/cpu/jit_avx512_common_1x1_convolution.cpp
+index 6879cd91..6a32aa49 100644
+--- a/src/cpu/jit_avx512_common_1x1_convolution.cpp
++++ b/src/cpu/jit_avx512_common_1x1_convolution.cpp
+@@ -106,7 +106,7 @@ execute_forward_thr(const int ithr, const int nthr, const src_data_t *src,
+     const memory_desc_wrapper weights_d(pd()->weights_pd(0));
+ 
+     const auto &jcp = kernel_->jcp;
+-    auto rtus_space = scratchpad.get<src_data_t>(key_conv_rtus_space);
++    auto rtus_space = pd()->rtus_.reduce_src_?scratchpad.get<src_data_t>(key_conv_rtus_space):NULL;
+ 
+     const int ndims = src_d.ndims();
+     const int stride_h = (ndims == 3) ? 1 : pd()->desc()->strides[0];
+@@ -301,9 +301,8 @@ void jit_avx512_common_1x1_convolution_bwd_data_t<diff_dst_type, wei_type,
+     const memory_desc_wrapper diff_src_d(pd()->diff_src_pd());
+ 
+     const auto &jcp = kernel_->jcp;
+-    auto rtus_space = scratchpad().template get<diff_src_data_t>(
+-            key_conv_rtus_space);
+-
++    auto rtus_space = pd()->rtus_.reduce_src_? scratchpad().template get<diff_src_data_t>(key_conv_rtus_space): NULL;
++	
+     const int ndims = diff_src_d.ndims();
+ 
+     // TODO (Roma): remove this restriction
+@@ -470,7 +469,7 @@ void jit_avx512_common_1x1_convolution_bwd_weights_t::execute_backward_weights()
+ 
+     const auto scratchpad = this->scratchpad();
+ 
+-    auto rtus_space = scratchpad.get<data_t>(key_conv_rtus_space);
++    auto rtus_space = pd()->rtus_.reduce_src_?scratchpad.get<data_t>(key_conv_rtus_space):NULL;
+     data_t *diff_bias = pd()->wants_padded_bias()
+         ? scratchpad.get<data_t>(key_conv_padded_bias) : diff_bias_in;
+     auto wei_reduction = scratchpad.get<data_t>(key_conv_wei_reduction);
+diff --git a/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp b/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp
+index de303cd2..ec0c54e7 100644
+--- a/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp
++++ b/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp
+@@ -100,8 +100,10 @@ void jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t<src_type, dst_type>
+         ? types::data_type_size(pd()->desc()->bias_desc.data_type) : 0;
+ 
+     const auto &jcp = kernel_->jcp;
+-    auto rtus_space = scratchpad.get<src_data_t>(key_conv_rtus_space);
+-    auto local_scales = scratchpad.get<float>(key_conv_adjusted_scales);
++	
++    auto rtus_space = pd()->rtus_.reduce_src_?scratchpad.get<src_data_t>(key_conv_rtus_space):NULL;
++    
++	auto local_scales = scratchpad.get<float>(key_conv_adjusted_scales);
+ 
+     const int work_amount = jcp.mb * jcp.ngroups * jcp.nb_bcast;
+ 
+diff --git a/src/cpu/jit_uni_1x1_conv_utils.hpp b/src/cpu/jit_uni_1x1_conv_utils.hpp
+index a3ed769a..5a0e0635 100644
+--- a/src/cpu/jit_uni_1x1_conv_utils.hpp
++++ b/src/cpu/jit_uni_1x1_conv_utils.hpp
+@@ -94,7 +94,8 @@ inline void rtus_prepare(conv_pd_t *self, const convolution_desc_t *&conv_d,
+ template <typename conv_pd_t>
+ inline void rtus_prepare_space_info(conv_pd_t *self,
+         memory_tracking::registrar_t &scratchpad) {
+-    const auto &jcp = self->jcp_;
++    if (!self->rtus_.reduce_src_) return;
++	const auto &jcp = self->jcp_;
+ 
+     const int max_threads = mkldnn_get_max_threads();
+     const size_t factor = utils::pick_by_prop_kind(self->desc()->prop_kind,
+-- 
+2.17.0.windows.1
+
diff --git a/onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.cc b/onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.cc
index a9ea7ca581..501b4efab3 100644
--- a/onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.cc
+++ b/onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.cc
@@ -112,13 +112,14 @@ std::shared_ptr<KernelRegistry> MKLDNNExecutionProvider::GetKernelRegistry() con
 }
 
 bool MKLDNNExecutionProvider::UseSubgraph(const onnxruntime::GraphViewer& graph_viewer,
-                                          const std::vector<const KernelRegistry*>& kernel_registries,
-                                          std::vector<std::unique_ptr<ComputeCapability>>& result) const {
+                                          const std::vector<const KernelRegistry*>& kernel_registries) const {
   // switch between mkldnn-vanilla and mkldnn-subgraph implementation using
   // MKLDNN_SUBGRAPH environment variable
   bool use_subgraph = true;
 
   bool FP16_graph = false;
+  bool mkldnn_nodes_in_the_graph = false;
+
   if (graph_viewer.MaxNodeIndex() > 0) {
     int index = 0;
     auto node = graph_viewer.GetNode(index);
@@ -130,16 +131,27 @@ bool MKLDNNExecutionProvider::UseSubgraph(const onnxruntime::GraphViewer& graph_
       FP16_graph = node->InputDefs()[0]->Type()->find("16") != std::string::npos;
   }
 
-  if (FP16_graph) {
+  for (auto node_index = 0; node_index < graph_viewer.MaxNodeIndex(); node_index++) {
+    auto node = graph_viewer.GetNode(node_index);
+    if (node == nullptr) {
+      node_index++;
+      continue;
+    }
+    auto op_it = mkldnn_ops_.find(node->OpType());
+    if (op_it != mkldnn_ops_.end()) {
+      mkldnn_nodes_in_the_graph = true;
+      break;
+    }
+  }
+
+  if (FP16_graph || !mkldnn_nodes_in_the_graph) {
     // FP16 not supported yet.
     use_subgraph = false;
-    result = IExecutionProvider::GetCapability(graph_viewer, kernel_registries);
   } else {
     const char* env = getenv("ORT_MKLDNN_SUBGRAPH");
     if (env != nullptr) {
       if (atoi(env) == 0) {
         use_subgraph = false;
-        result = IExecutionProvider::GetCapability(graph_viewer, kernel_registries);
       }
     }
   }
@@ -209,16 +221,16 @@ std::vector<std::unique_ptr<ComputeCapability>> MKLDNNExecutionProvider::GetCapa
     const onnxruntime::GraphViewer& graph_viewer,
     const std::vector<const KernelRegistry*>& kernel_registries) const {
   ORT_UNUSED_PARAMETER(kernel_registries);
-  std::vector<std::unique_ptr<ComputeCapability>> result;
 
   // temporary switch to toggle between mkldnn-vanilla and mkldnn-subgraph implementation using
   // ORT_MKLDNN_SUBGRAPH environment variable
-  if (UseSubgraph(graph_viewer, kernel_registries, result) == false) {
-    return result;
+  if (UseSubgraph(graph_viewer, kernel_registries) == false) {
+    return IExecutionProvider::GetCapability(graph_viewer, kernel_registries);
   }
 
   LOGS_DEFAULT(INFO) << "Using MKL-DNN Subgraph";
   // use sub-graph implementation
+  std::vector<std::unique_ptr<ComputeCapability>> result;
   mkl_dnn::Subgraph::SubgraphVariables sub_var;
   std::shared_ptr<mkl_dnn::Subgraph> subgraph_ptr;
 
@@ -243,6 +255,12 @@ std::vector<std::unique_ptr<ComputeCapability>> MKLDNNExecutionProvider::GetCapa
 
     if (IsDimensionSupported(node) == false) {
       node_index++;
+      if (subgraph_ptr->mkldnn_nodes.size() > 0) {
+        CreateMetaDef(graph_viewer, subgraph_attributes, subgraph_ptr, sub_var, result);
+        subgraph_ptr.reset(new mkl_dnn::Subgraph(graph_name));
+        subgraph_attributes.clear();
+        output_to_source_node_map.clear();
+      }
       continue;
     }
 
@@ -436,7 +454,7 @@ Status MKLDNNExecutionProvider::Compile(const std::vector<onnxruntime::Node*>& f
 
     compute_info.compute_func = [](FunctionState state, const OrtCustomOpApi* api, OrtKernelContext* context) {
       onnxruntime::mkl_dnn::MkldnnFuncKernel<float>* custom_op = reinterpret_cast<mkl_dnn::MkldnnFuncKernel<float>*>(state);
-      return  custom_op->Compute(api, context);
+      return custom_op->Compute(api, context);
     };
 
     node_compute_funcs.push_back(compute_info);
diff --git a/onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.h b/onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.h
index 3e173e7a79..53dcdf45c6 100644
--- a/onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.h
+++ b/onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.h
@@ -99,8 +99,7 @@ class MKLDNNExecutionProvider : public IExecutionProvider {
   }
 
   bool UseSubgraph(const onnxruntime::GraphViewer& graph_viewer,
-                   const std::vector<const KernelRegistry*>& kernel_registries,
-                   std::vector<std::unique_ptr<ComputeCapability>>& result) const;
+                   const std::vector<const KernelRegistry*>& kernel_registries) const;
 
   // Some dimensions are not supported by MKL-DNN
   // example: Pool with NumDimensions <= 3 is not supported
diff --git a/onnxruntime/core/providers/mkldnn/subgraph/mkldnn_conv.h b/onnxruntime/core/providers/mkldnn/subgraph/mkldnn_conv.h
index ceafa69e5f..9e421ec365 100644
--- a/onnxruntime/core/providers/mkldnn/subgraph/mkldnn_conv.h
+++ b/onnxruntime/core/providers/mkldnn/subgraph/mkldnn_conv.h
@@ -238,13 +238,13 @@ class MklDnnConv : public MklDnnKernel {
 
     if (!bias_dims_mkl.empty()) {
       fwd_desc_.reset(new mkldnn::convolution_forward::desc(
-          mkldnn::prop_kind::forward, mkldnn::convolution_direct, *src_md_,
+          mkldnn::prop_kind::forward_inference, mkldnn::convolution_direct, *src_md_,
           *filter_md_, *bias_md_, *primitive_dst_md_,
           strides_mkl, dilations_mkl, padding_left_mkl,
           padding_right_mkl, mkldnn::padding_kind::zero));
     } else {
       fwd_desc_.reset(new mkldnn::convolution_forward::desc(
-          mkldnn::prop_kind::forward, mkldnn::convolution_direct, *src_md_,
+          mkldnn::prop_kind::forward_inference, mkldnn::convolution_direct, *src_md_,
           *filter_md_, *primitive_dst_md_, strides_mkl,
           dilations_mkl, padding_left_mkl,
           padding_right_mkl, mkldnn::padding_kind::zero));