onnxruntime/cmake/patches/mkldnn/mem-patch.cmake.patch
Sreekanth Yalachigere f3c74ec3e9 Reduce memory footprint of MKL-DNN EP (#1429)
* MKL-DNN EP memory fix patch

* Call default provider for Opset10

* opset 10 fix

* removed email header from patch

* UseSubgraph method refactored
2019-07-18 22:57:00 -07:00

107 lines
5.1 KiB
Diff

---
src/cpu/jit_avx2_1x1_convolution.cpp | 6 +++---
src/cpu/jit_avx512_common_1x1_convolution.cpp | 9 ++++-----
src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp | 6 ++++--
src/cpu/jit_uni_1x1_conv_utils.hpp | 3 ++-
4 files changed, 13 insertions(+), 11 deletions(-)
diff --git a/src/cpu/jit_avx2_1x1_convolution.cpp b/src/cpu/jit_avx2_1x1_convolution.cpp
index 46362886..edb2b6fb 100644
--- a/src/cpu/jit_avx2_1x1_convolution.cpp
+++ b/src/cpu/jit_avx2_1x1_convolution.cpp
@@ -50,7 +50,7 @@ void jit_avx2_1x1_convolution_fwd_t::execute_forward() const {
const memory_desc_wrapper weights_d(pd()->weights_pd(0));
const auto &jcp = kernel_->jcp;
- auto rtus_space = scratchpad().get<data_t>(key_conv_rtus_space);
+ auto rtus_space = pd()->rtus_.reduce_src_ ? scratchpad().get<data_t>(key_conv_rtus_space) : nullptr; // null unless reduce_src_ is set
const int work_amount = jcp.mb * jcp.ngroups * jcp.nb_bcast;
const int ndims = dst_d.ndims();
@@ -180,7 +180,7 @@ void jit_avx2_1x1_convolution_bwd_data_t::execute_backward_data() const {
const memory_desc_wrapper diff_src_d(pd()->diff_src_pd());
const auto &jcp = kernel_->jcp;
- auto rtus_space = scratchpad().get<data_t>(key_conv_rtus_space);
+ auto rtus_space = pd()->rtus_.reduce_src_ ? scratchpad().get<data_t>(key_conv_rtus_space) : nullptr; // null unless reduce_src_ is set
// TODO (Roma): remove this restriction
assert(jcp.stride_w == 1 && jcp.stride_h == 1);
@@ -306,7 +306,7 @@ void jit_avx2_1x1_convolution_bwd_weights_t::execute_backward_weights() const {
const memory_desc_wrapper diff_bias_d(pd()->diff_weights_pd(1));
const auto &jcp = kernel_->jcp;
- auto rtus_space = scratchpad.get<data_t>(key_conv_rtus_space);
+ auto rtus_space = pd()->rtus_.reduce_src_ ? scratchpad.get<data_t>(key_conv_rtus_space) : nullptr; // null unless reduce_src_ is set
data_t *diff_bias = pd()->wants_padded_bias()
? scratchpad.get<data_t>(key_conv_padded_bias) : diff_bias_in;
diff --git a/src/cpu/jit_avx512_common_1x1_convolution.cpp b/src/cpu/jit_avx512_common_1x1_convolution.cpp
index 6879cd91..6a32aa49 100644
--- a/src/cpu/jit_avx512_common_1x1_convolution.cpp
+++ b/src/cpu/jit_avx512_common_1x1_convolution.cpp
@@ -106,7 +106,7 @@ execute_forward_thr(const int ithr, const int nthr, const src_data_t *src,
const memory_desc_wrapper weights_d(pd()->weights_pd(0));
const auto &jcp = kernel_->jcp;
- auto rtus_space = scratchpad.get<src_data_t>(key_conv_rtus_space);
+ auto rtus_space = pd()->rtus_.reduce_src_ ? scratchpad.get<src_data_t>(key_conv_rtus_space) : nullptr; // null unless reduce_src_ is set
const int ndims = src_d.ndims();
const int stride_h = (ndims == 3) ? 1 : pd()->desc()->strides[0];
@@ -301,9 +301,8 @@ void jit_avx512_common_1x1_convolution_bwd_data_t<diff_dst_type, wei_type,
const memory_desc_wrapper diff_src_d(pd()->diff_src_pd());
const auto &jcp = kernel_->jcp;
- auto rtus_space = scratchpad().template get<diff_src_data_t>(
- key_conv_rtus_space);
-
+ auto rtus_space = pd()->rtus_.reduce_src_ ? scratchpad().template get<diff_src_data_t>(key_conv_rtus_space) : nullptr; // null unless reduce_src_ is set
+
const int ndims = diff_src_d.ndims();
// TODO (Roma): remove this restriction
@@ -470,7 +469,7 @@ void jit_avx512_common_1x1_convolution_bwd_weights_t::execute_backward_weights()
const auto scratchpad = this->scratchpad();
- auto rtus_space = scratchpad.get<data_t>(key_conv_rtus_space);
+ auto rtus_space = pd()->rtus_.reduce_src_ ? scratchpad.get<data_t>(key_conv_rtus_space) : nullptr; // null unless reduce_src_ is set
data_t *diff_bias = pd()->wants_padded_bias()
? scratchpad.get<data_t>(key_conv_padded_bias) : diff_bias_in;
auto wei_reduction = scratchpad.get<data_t>(key_conv_wei_reduction);
diff --git a/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp b/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp
index de303cd2..ec0c54e7 100644
--- a/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp
+++ b/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp
@@ -100,8 +100,10 @@ void jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t<src_type, dst_type>
? types::data_type_size(pd()->desc()->bias_desc.data_type) : 0;
const auto &jcp = kernel_->jcp;
- auto rtus_space = scratchpad.get<src_data_t>(key_conv_rtus_space);
- auto local_scales = scratchpad.get<float>(key_conv_adjusted_scales);
+
+ auto rtus_space = pd()->rtus_.reduce_src_ ? scratchpad.get<src_data_t>(key_conv_rtus_space) : nullptr; // null unless reduce_src_ is set
+
+ auto local_scales = scratchpad.get<float>(key_conv_adjusted_scales);
const int work_amount = jcp.mb * jcp.ngroups * jcp.nb_bcast;
diff --git a/src/cpu/jit_uni_1x1_conv_utils.hpp b/src/cpu/jit_uni_1x1_conv_utils.hpp
index a3ed769a..5a0e0635 100644
--- a/src/cpu/jit_uni_1x1_conv_utils.hpp
+++ b/src/cpu/jit_uni_1x1_conv_utils.hpp
@@ -94,7 +94,8 @@ inline void rtus_prepare(conv_pd_t *self, const convolution_desc_t *&conv_d,
template <typename conv_pd_t>
inline void rtus_prepare_space_info(conv_pd_t *self,
memory_tracking::registrar_t &scratchpad) {
- const auto &jcp = self->jcp_;
+ if (!self->rtus_.reduce_src_) return; // nothing to set up when reduce_src_ is off; the executors guard their key_conv_rtus_space lookups with the same flag
+ const auto &jcp = self->jcp_;
const int max_threads = mkldnn_get_max_threads();
const size_t factor = utils::pick_by_prop_kind(self->desc()->prop_kind,
--
2.17.0.windows.1