mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-04 23:59:56 +00:00
* Fix unnecessary memory allocation in MKLDNN 1x1 convolution.
* Remove the patch header.
125 lines
6.1 KiB
Diff
125 lines
6.1 KiB
Diff
cmake/external_mkldnn.cmake | 1 +
|
|
cmake/mkldnn_fix_memory.patch | 99 +++++++++++++++++++++++++++++++++++
|
|
2 files changed, 100 insertions(+)
|
|
create mode 100644 cmake/mkldnn_fix_memory.patch
|
|
|
|
diff --git a/cmake/external_mkldnn.cmake b/cmake/external_mkldnn.cmake
|
|
index 7874aca76..bbae6d1a4 100644
|
|
--- a/cmake/external_mkldnn.cmake
|
|
+++ b/cmake/external_mkldnn.cmake
|
|
@@ -194,6 +194,7 @@ if (WIN32)
|
|
CONFIGURE_COMMAND
|
|
PATCH_COMMAND ${MKLDNN_PATCH_REVERT_COMMAND}
|
|
COMMAND git apply --ignore-space-change --ignore-whitespace ${CMAKE_SOURCE_DIR}/cmake/${MKLDNN_PATCH_FILE}
|
|
+ COMMAND git apply --ignore-space-change --ignore-whitespace ${CMAKE_SOURCE_DIR}/cmake/mkldnn_fix_memory.patch
|
|
CMAKE_GENERATOR ${CMAKE_GENERATOR}
|
|
CMAKE_GENERATOR_PLATFORM ${CMAKE_GENERATOR_PLATFORM}
|
|
CMAKE_GENERATOR_TOOLSET ${CMAKE_GENERATOR_TOOLSET}
|
|
diff --git a/cmake/mkldnn_fix_memory.patch b/cmake/mkldnn_fix_memory.patch
|
|
new file mode 100644
|
|
index 000000000..ea1a3bd61
|
|
--- /dev/null
|
|
+++ b/cmake/mkldnn_fix_memory.patch
|
|
@@ -0,0 +1,99 @@
|
|
+ src/cpu/jit_avx2_1x1_convolution.cpp | 6 +++---
|
|
+ src/cpu/jit_avx512_common_1x1_convolution.cpp | 9 +++++----
|
|
+ src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp | 2 +-
|
|
+ src/cpu/jit_uni_1x1_conv_utils.hpp | 1 +
|
|
+ 4 files changed, 10 insertions(+), 8 deletions(-)
|
|
+
|
|
+diff --git a/src/cpu/jit_avx2_1x1_convolution.cpp b/src/cpu/jit_avx2_1x1_convolution.cpp
|
|
+index 46362886..edb2b6fb 100644
|
|
+--- a/src/cpu/jit_avx2_1x1_convolution.cpp
|
|
++++ b/src/cpu/jit_avx2_1x1_convolution.cpp
|
|
+@@ -50,7 +50,7 @@ void jit_avx2_1x1_convolution_fwd_t::execute_forward() const {
|
|
+ const memory_desc_wrapper weights_d(pd()->weights_pd(0));
|
|
+
|
|
+ const auto &jcp = kernel_->jcp;
|
|
+- auto rtus_space = scratchpad().get<data_t>(key_conv_rtus_space);
|
|
++ auto rtus_space = pd()->rtus_.reduce_src_?scratchpad().get<data_t>(key_conv_rtus_space):NULL;
|
|
+
|
|
+ const int work_amount = jcp.mb * jcp.ngroups * jcp.nb_bcast;
|
|
+ const int ndims = dst_d.ndims();
|
|
+@@ -180,7 +180,7 @@ void jit_avx2_1x1_convolution_bwd_data_t::execute_backward_data() const {
|
|
+ const memory_desc_wrapper diff_src_d(pd()->diff_src_pd());
|
|
+
|
|
+ const auto &jcp = kernel_->jcp;
|
|
+- auto rtus_space = scratchpad().get<data_t>(key_conv_rtus_space);
|
|
++ auto rtus_space = pd()->rtus_.reduce_src_?scratchpad().get<data_t>(key_conv_rtus_space):NULL;
|
|
+
|
|
+ // TODO (Roma): remove this restriction
|
|
+ assert(jcp.stride_w == 1 && jcp.stride_h == 1);
|
|
+@@ -306,7 +306,7 @@ void jit_avx2_1x1_convolution_bwd_weights_t::execute_backward_weights() const {
|
|
+ const memory_desc_wrapper diff_bias_d(pd()->diff_weights_pd(1));
|
|
+
|
|
+ const auto &jcp = kernel_->jcp;
|
|
+- auto rtus_space = scratchpad.get<data_t>(key_conv_rtus_space);
|
|
++ auto rtus_space = pd()->rtus_.reduce_src_?scratchpad.get<data_t>(key_conv_rtus_space):NULL;
|
|
+
|
|
+ data_t *diff_bias = pd()->wants_padded_bias()
|
|
+ ? scratchpad.get<data_t>(key_conv_padded_bias) : diff_bias_in;
|
|
+diff --git a/src/cpu/jit_avx512_common_1x1_convolution.cpp b/src/cpu/jit_avx512_common_1x1_convolution.cpp
|
|
+index 6879cd91..47cea4f4 100644
|
|
+--- a/src/cpu/jit_avx512_common_1x1_convolution.cpp
|
|
++++ b/src/cpu/jit_avx512_common_1x1_convolution.cpp
|
|
+@@ -106,7 +106,7 @@ execute_forward_thr(const int ithr, const int nthr, const src_data_t *src,
|
|
+ const memory_desc_wrapper weights_d(pd()->weights_pd(0));
|
|
+
|
|
+ const auto &jcp = kernel_->jcp;
|
|
+- auto rtus_space = scratchpad.get<src_data_t>(key_conv_rtus_space);
|
|
++ auto rtus_space = pd()->rtus_.reduce_src_?scratchpad.get<src_data_t>(key_conv_rtus_space):NULL;
|
|
+
|
|
+ const int ndims = src_d.ndims();
|
|
+ const int stride_h = (ndims == 3) ? 1 : pd()->desc()->strides[0];
|
|
+@@ -301,8 +301,9 @@ void jit_avx512_common_1x1_convolution_bwd_data_t<diff_dst_type, wei_type,
|
|
+ const memory_desc_wrapper diff_src_d(pd()->diff_src_pd());
|
|
+
|
|
+ const auto &jcp = kernel_->jcp;
|
|
+- auto rtus_space = scratchpad().template get<diff_src_data_t>(
|
|
+- key_conv_rtus_space);
|
|
++ auto rtus_space = pd()->rtus_.reduce_src_
|
|
++ ? scratchpad().template get<diff_src_data_t>(key_conv_rtus_space)
|
|
++ : NULL;
|
|
+
|
|
+ const int ndims = diff_src_d.ndims();
|
|
+
|
|
+@@ -470,7 +471,7 @@ void jit_avx512_common_1x1_convolution_bwd_weights_t::execute_backward_weights()
|
|
+
|
|
+ const auto scratchpad = this->scratchpad();
|
|
+
|
|
+- auto rtus_space = scratchpad.get<data_t>(key_conv_rtus_space);
|
|
++ auto rtus_space = pd()->rtus_.reduce_src_?scratchpad.get<data_t>(key_conv_rtus_space):NULL;
|
|
+ data_t *diff_bias = pd()->wants_padded_bias()
|
|
+ ? scratchpad.get<data_t>(key_conv_padded_bias) : diff_bias_in;
|
|
+ auto wei_reduction = scratchpad.get<data_t>(key_conv_wei_reduction);
|
|
+diff --git a/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp b/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp
|
|
+index de303cd2..8129f2b2 100644
|
|
+--- a/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp
|
|
++++ b/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp
|
|
+@@ -100,7 +100,7 @@ void jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t<src_type, dst_type>
|
|
+ ? types::data_type_size(pd()->desc()->bias_desc.data_type) : 0;
|
|
+
|
|
+ const auto &jcp = kernel_->jcp;
|
|
+- auto rtus_space = scratchpad.get<src_data_t>(key_conv_rtus_space);
|
|
++ auto rtus_space = pd()->rtus_.reduce_src_?scratchpad.get<src_data_t>(key_conv_rtus_space):NULL;
|
|
+ auto local_scales = scratchpad.get<float>(key_conv_adjusted_scales);
|
|
+
|
|
+ const int work_amount = jcp.mb * jcp.ngroups * jcp.nb_bcast;
|
|
+diff --git a/src/cpu/jit_uni_1x1_conv_utils.hpp b/src/cpu/jit_uni_1x1_conv_utils.hpp
|
|
+index a3ed769a..6d76ba56 100644
|
|
+--- a/src/cpu/jit_uni_1x1_conv_utils.hpp
|
|
++++ b/src/cpu/jit_uni_1x1_conv_utils.hpp
|
|
+@@ -94,6 +94,7 @@ inline void rtus_prepare(conv_pd_t *self, const convolution_desc_t *&conv_d,
|
|
+ template <typename conv_pd_t>
|
|
+ inline void rtus_prepare_space_info(conv_pd_t *self,
|
|
+ memory_tracking::registrar_t &scratchpad) {
|
|
++ if (!self->rtus_.reduce_src_) return;
|
|
+ const auto &jcp = self->jcp_;
|
|
+
|
|
+ const int max_threads = mkldnn_get_max_threads();
|
|
+--
|
|
+2.20.1.windows.1
|
|
+
|
|
--
|
|
2.20.1.windows.1
|
|
|