From f3c74ec3e9ee7e80daa3ccc7e95c3249844e7e41 Mon Sep 17 00:00:00 2001 From: Sreekanth Yalachigere <17345104+sreekanth-yalachigere@users.noreply.github.com> Date: Thu, 18 Jul 2019 22:57:00 -0700 Subject: [PATCH] Reduce memory footprint of MKL-DNN EP (#1429) * MKL-DNN EP memory fix patch * Call default provider for Opset10 * opset 10 fix * removed email header from patch * UseSubgraph method refactored --- cmake/external/mkldnn.cmake | 2 + cmake/patches/mkldnn/mem-patch.cmake.patch | 107 ++++++++++++++++++ .../mkldnn/mkldnn_execution_provider.cc | 36 ++++-- .../mkldnn/mkldnn_execution_provider.h | 3 +- .../providers/mkldnn/subgraph/mkldnn_conv.h | 4 +- 5 files changed, 139 insertions(+), 13 deletions(-) create mode 100644 cmake/patches/mkldnn/mem-patch.cmake.patch diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index ed05d885a6..796541f096 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -61,6 +61,7 @@ if (onnxruntime_USE_MKLDNN) set(MKLDNN_INCLUDE_DIR ${MKLDNN_INSTALL}/include) if(NOT onnxruntime_BUILD_FOR_NATIVE_MACHINE) set(MKLDNN_PATCH_COMMAND1 git apply ${CMAKE_SOURCE_DIR}/patches/mkldnn/platform.cmake.patch) + set(MKLDNN_PATCH_COMMAND2 git apply ${CMAKE_SOURCE_DIR}/patches/mkldnn/mem-patch.cmake.patch) # discard prior changes due to patching in mkldnn source to unblock incremental builds. set(MKLDNN_PATCH_DISCARD_COMMAND cd ${MKLDNN_SOURCE} && git checkout -- .) endif() @@ -69,6 +70,7 @@ if (onnxruntime_USE_MKLDNN) GIT_REPOSITORY ${MKLDNN_URL} GIT_TAG ${MKLDNN_TAG} PATCH_COMMAND ${MKLDNN_PATCH_DISCARD_COMMAND} COMMAND ${MKLDNN_PATCH_COMMAND1} + COMMAND ${MKLDNN_PATCH_COMMAND2} SOURCE_DIR ${MKLDNN_SOURCE} CMAKE_ARGS -DMKLDNN_PRODUCT_BUILD_MODE=OFF -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL} -DMKLROOT=${MKML_DIR} ) diff --git a/cmake/patches/mkldnn/mem-patch.cmake.patch b/cmake/patches/mkldnn/mem-patch.cmake.patch new file mode 100644 index 0000000000..9d0a2543e6 --- /dev/null +++ b/cmake/patches/mkldnn/mem-patch.cmake.patch @@ -0,0 +1,107 @@ + +--- + src/cpu/jit_avx2_1x1_convolution.cpp | 6 +++--- + src/cpu/jit_avx512_common_1x1_convolution.cpp | 9 ++++----- + src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp | 6 ++++-- + src/cpu/jit_uni_1x1_conv_utils.hpp | 3 ++- + 4 files changed, 13 insertions(+), 11 deletions(-) + +diff --git a/src/cpu/jit_avx2_1x1_convolution.cpp b/src/cpu/jit_avx2_1x1_convolution.cpp +index 46362886..edb2b6fb 100644 +--- a/src/cpu/jit_avx2_1x1_convolution.cpp ++++ b/src/cpu/jit_avx2_1x1_convolution.cpp +@@ -50,7 +50,7 @@ void jit_avx2_1x1_convolution_fwd_t::execute_forward() const { + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); + + const auto &jcp = kernel_->jcp; +- auto rtus_space = scratchpad().get(key_conv_rtus_space); ++ auto rtus_space = pd()->rtus_.reduce_src_?scratchpad().get(key_conv_rtus_space):NULL; + + const int work_amount = jcp.mb * jcp.ngroups * jcp.nb_bcast; + const int ndims = dst_d.ndims(); +@@ -180,7 +180,7 @@ void jit_avx2_1x1_convolution_bwd_data_t::execute_backward_data() const { + const memory_desc_wrapper diff_src_d(pd()->diff_src_pd()); + + const auto &jcp = kernel_->jcp; +- auto rtus_space = scratchpad().get(key_conv_rtus_space); ++ auto rtus_space = pd()->rtus_.reduce_src_?scratchpad().get(key_conv_rtus_space):NULL; + + // TODO (Roma): remove this restriction + assert(jcp.stride_w == 1 && jcp.stride_h == 1); +@@ -306,7 +306,7 @@ void jit_avx2_1x1_convolution_bwd_weights_t::execute_backward_weights() const { + const memory_desc_wrapper diff_bias_d(pd()->diff_weights_pd(1)); + + const auto &jcp = kernel_->jcp; +- auto rtus_space = scratchpad.get(key_conv_rtus_space); ++ auto rtus_space = pd()->rtus_.reduce_src_?scratchpad.get(key_conv_rtus_space):NULL; + + data_t *diff_bias = pd()->wants_padded_bias() + ? scratchpad.get(key_conv_padded_bias) : diff_bias_in; +diff --git a/src/cpu/jit_avx512_common_1x1_convolution.cpp b/src/cpu/jit_avx512_common_1x1_convolution.cpp +index 6879cd91..6a32aa49 100644 +--- a/src/cpu/jit_avx512_common_1x1_convolution.cpp ++++ b/src/cpu/jit_avx512_common_1x1_convolution.cpp +@@ -106,7 +106,7 @@ execute_forward_thr(const int ithr, const int nthr, const src_data_t *src, + const memory_desc_wrapper weights_d(pd()->weights_pd(0)); + + const auto &jcp = kernel_->jcp; +- auto rtus_space = scratchpad.get(key_conv_rtus_space); ++ auto rtus_space = pd()->rtus_.reduce_src_?scratchpad.get(key_conv_rtus_space):NULL; + + const int ndims = src_d.ndims(); + const int stride_h = (ndims == 3) ? 1 : pd()->desc()->strides[0]; +@@ -301,9 +301,8 @@ void jit_avx512_common_1x1_convolution_bwd_data_tdiff_src_pd()); + + const auto &jcp = kernel_->jcp; +- auto rtus_space = scratchpad().template get( +- key_conv_rtus_space); +- ++ auto rtus_space = pd()->rtus_.reduce_src_? scratchpad().template get(key_conv_rtus_space): NULL; ++ + const int ndims = diff_src_d.ndims(); + + // TODO (Roma): remove this restriction +@@ -470,7 +469,7 @@ void jit_avx512_common_1x1_convolution_bwd_weights_t::execute_backward_weights() + + const auto scratchpad = this->scratchpad(); + +- auto rtus_space = scratchpad.get(key_conv_rtus_space); ++ auto rtus_space = pd()->rtus_.reduce_src_?scratchpad.get(key_conv_rtus_space):NULL; + data_t *diff_bias = pd()->wants_padded_bias() + ? scratchpad.get(key_conv_padded_bias) : diff_bias_in; + auto wei_reduction = scratchpad.get(key_conv_wei_reduction); +diff --git a/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp b/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp +index de303cd2..ec0c54e7 100644 +--- a/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp ++++ b/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp +@@ -100,8 +100,10 @@ void jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t + ? types::data_type_size(pd()->desc()->bias_desc.data_type) : 0; + + const auto &jcp = kernel_->jcp; +- auto rtus_space = scratchpad.get(key_conv_rtus_space); +- auto local_scales = scratchpad.get(key_conv_adjusted_scales); ++ ++ auto rtus_space = pd()->rtus_.reduce_src_?scratchpad.get(key_conv_rtus_space):NULL; ++ ++ auto local_scales = scratchpad.get(key_conv_adjusted_scales); + + const int work_amount = jcp.mb * jcp.ngroups * jcp.nb_bcast; + +diff --git a/src/cpu/jit_uni_1x1_conv_utils.hpp b/src/cpu/jit_uni_1x1_conv_utils.hpp +index a3ed769a..5a0e0635 100644 +--- a/src/cpu/jit_uni_1x1_conv_utils.hpp ++++ b/src/cpu/jit_uni_1x1_conv_utils.hpp +@@ -94,7 +94,8 @@ inline void rtus_prepare(conv_pd_t *self, const convolution_desc_t *&conv_d, + template + inline void rtus_prepare_space_info(conv_pd_t *self, + memory_tracking::registrar_t &scratchpad) { +- const auto &jcp = self->jcp_; ++ if (!self->rtus_.reduce_src_) return; ++ const auto &jcp = self->jcp_; + + const int max_threads = mkldnn_get_max_threads(); + const size_t factor = utils::pick_by_prop_kind(self->desc()->prop_kind, +-- +2.17.0.windows.1 + diff --git a/onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.cc b/onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.cc index a9ea7ca581..501b4efab3 100644 --- a/onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.cc +++ b/onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.cc @@ -112,13 +112,14 @@ std::shared_ptr MKLDNNExecutionProvider::GetKernelRegistry() con } bool MKLDNNExecutionProvider::UseSubgraph(const onnxruntime::GraphViewer& graph_viewer, - const std::vector& kernel_registries, - std::vector>& result) const { + const std::vector& kernel_registries) const { // switch between mkldnn-vanilla and mkldnn-subgraph implementation using // MKLDNN_SUBGRAPH environment variable bool use_subgraph = true; bool FP16_graph = false; + bool mkldnn_nodes_in_the_graph = false; + if (graph_viewer.MaxNodeIndex() > 0) { int index = 0; auto node = graph_viewer.GetNode(index); @@ -130,16 +131,27 @@ bool MKLDNNExecutionProvider::UseSubgraph(const onnxruntime::GraphViewer& graph_ FP16_graph = node->InputDefs()[0]->Type()->find("16") != std::string::npos; } - if (FP16_graph) { + for (auto node_index = 0; node_index < graph_viewer.MaxNodeIndex(); node_index++) { + auto node = graph_viewer.GetNode(node_index); + if (node == nullptr) { + node_index++; + continue; + } + auto op_it = mkldnn_ops_.find(node->OpType()); + if (op_it != mkldnn_ops_.end()) { + mkldnn_nodes_in_the_graph = true; + break; + } + } + + if (FP16_graph || !mkldnn_nodes_in_the_graph) { // FP16 not supported yet. use_subgraph = false; - result = IExecutionProvider::GetCapability(graph_viewer, kernel_registries); } else { const char* env = getenv("ORT_MKLDNN_SUBGRAPH"); if (env != nullptr) { if (atoi(env) == 0) { use_subgraph = false; - result = IExecutionProvider::GetCapability(graph_viewer, kernel_registries); } } } @@ -209,16 +221,16 @@ std::vector> MKLDNNExecutionProvider::GetCapa const onnxruntime::GraphViewer& graph_viewer, const std::vector& kernel_registries) const { ORT_UNUSED_PARAMETER(kernel_registries); - std::vector> result; // temporary switch to toggle between mkldnn-vanilla and mkldnn-subgraph implementation using // ORT_MKLDNN_SUBGRAPH environment variable - if (UseSubgraph(graph_viewer, kernel_registries, result) == false) { - return result; + if (UseSubgraph(graph_viewer, kernel_registries) == false) { + return IExecutionProvider::GetCapability(graph_viewer, kernel_registries); } LOGS_DEFAULT(INFO) << "Using MKL-DNN Subgraph"; // use sub-graph implementation + std::vector> result; mkl_dnn::Subgraph::SubgraphVariables sub_var; std::shared_ptr subgraph_ptr; @@ -243,6 +255,12 @@ std::vector> MKLDNNExecutionProvider::GetCapa if (IsDimensionSupported(node) == false) { node_index++; + if (subgraph_ptr->mkldnn_nodes.size() > 0) { + CreateMetaDef(graph_viewer, subgraph_attributes, subgraph_ptr, sub_var, result); + subgraph_ptr.reset(new mkl_dnn::Subgraph(graph_name)); + subgraph_attributes.clear(); + output_to_source_node_map.clear(); + } continue; } @@ -436,7 +454,7 @@ Status MKLDNNExecutionProvider::Compile(const std::vector& f compute_info.compute_func = [](FunctionState state, const OrtCustomOpApi* api, OrtKernelContext* context) { onnxruntime::mkl_dnn::MkldnnFuncKernel* custom_op = reinterpret_cast*>(state); - return custom_op->Compute(api, context); + return custom_op->Compute(api, context); }; node_compute_funcs.push_back(compute_info); diff --git a/onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.h b/onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.h index 3e173e7a79..53dcdf45c6 100644 --- a/onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.h +++ b/onnxruntime/core/providers/mkldnn/mkldnn_execution_provider.h @@ -99,8 +99,7 @@ class MKLDNNExecutionProvider : public IExecutionProvider { } bool UseSubgraph(const onnxruntime::GraphViewer& graph_viewer, - const std::vector& kernel_registries, - std::vector>& result) const; + const std::vector& kernel_registries) const; // Some dimensions are not supported by MKL-DNN // example: Pool with NumDimensions <= 3 is not supported diff --git a/onnxruntime/core/providers/mkldnn/subgraph/mkldnn_conv.h b/onnxruntime/core/providers/mkldnn/subgraph/mkldnn_conv.h index ceafa69e5f..9e421ec365 100644 --- a/onnxruntime/core/providers/mkldnn/subgraph/mkldnn_conv.h +++ b/onnxruntime/core/providers/mkldnn/subgraph/mkldnn_conv.h @@ -238,13 +238,13 @@ class MklDnnConv : public MklDnnKernel { if (!bias_dims_mkl.empty()) { fwd_desc_.reset(new mkldnn::convolution_forward::desc( - mkldnn::prop_kind::forward, mkldnn::convolution_direct, *src_md_, + mkldnn::prop_kind::forward_inference, mkldnn::convolution_direct, *src_md_, *filter_md_, *bias_md_, *primitive_dst_md_, strides_mkl, dilations_mkl, padding_left_mkl, padding_right_mkl, mkldnn::padding_kind::zero)); } else { fwd_desc_.reset(new mkldnn::convolution_forward::desc( - mkldnn::prop_kind::forward, mkldnn::convolution_direct, *src_md_, + mkldnn::prop_kind::forward_inference, mkldnn::convolution_direct, *src_md_, *filter_md_, *primitive_dst_md_, strides_mkl, dilations_mkl, padding_left_mkl, padding_right_mkl, mkldnn::padding_kind::zero));