One dnn v2.6 update (#11220)

* Disable training code in DNNL LayerNorm code The capability code already does not claim the LayerNorm and SkipLayerNorm that require more than one output. However, building with training enabled was causing issues. The training specific code has been removed even when building with training enabled. Signed-off-by: George Nash <george.nash@intel.com> * Fix for DNNL FusedMatMul op. The bug was in the transpose code. Signed-off-by: Chethan Palangotu Keshava <chethan.palangotu.keshava@intel.com> * Use agreed upon memory format type when runnig Pooling Gradient in dnnl ep The dnnl ep does not currently have a way to pass memory_format information between the forward pooling primitive to the backward pooling primitive. This change explicitly sets the memory_format to use match that of Onnxruntime. For both the forward and backward pooling code. This will prevent using un-matched memory format that could result in an `unimplemented` error from dnnl ep. Signed-off-by: George Nash <george.nash@intel.com> * Update dnnl ep to use OneDNN v2.6 Do not run ReduceInfLogSum on the kDnnlExecutionProvider due to a calculation bug when doing Log or infinity valuse. The fix for this issue will be part of the next OneDNN release. Signed-off-by: George Nash <george.nash@intel.com> * Update PrintMemory function in dnnl ep This modification can be used to enable/disable memory printing for dnnl ep develpers. This is considered a developer only feature and is disabled by default. It must be enabled and code recompiled to use. Even if it is enabled it will not actually print any memory because the developer needs to take the extra step of spefifying the memory that will be printed to the screen. Signed-off-by: George Nash <george.nash@intel.com> * Update binary ops to run on intel GPU when using dnnl ep Binary ops (i.e. Add, Div, Mul, and Sub ) was updated to no longer call GetMemoryAndReshape in the past this would move the memory from CPU to the GPU. This extra call is no longer needed since it is taken care of by the GetMemoryInOrtFormat call. Removing the GetMemoryAndReshape prevented copying the memory to GPU twice. Signed-off-by: George Nash <george.nash@intel.com> Co-authored-by: Chethan Palangotu Keshava <chethan.palangotu.keshava@intel.com>
2026-07-21 19:18:55 +00:00 · 2022-04-15 12:51:11 -07:00 · 2022-04-15 12:51:11 -07:00 · d9eeb48393
commit d9eeb48393
parent 227bc7264e
9 changed files with 125 additions and 78 deletions
--- a/cmake/external/dnnl.cmake
+++ b/cmake/external/dnnl.cmake
@ -2,7 +2,7 @@ include (ExternalProject)

 set(DNNL_URL https://github.com/oneapi-src/onednn.git)
 # If DNNL_TAG is updated, check if MKLML_VERSION and platform.cmake.patch need to be updated.
-set(DNNL_TAG v2.4.4)
+set(DNNL_TAG v2.6)

 if(WIN32)
  set(DNNL_SHARED_LIB dnnl.dll)
--- a/onnxruntime/core/providers/dnnl/dnnl_node_capability.cc
+++ b/onnxruntime/core/providers/dnnl/dnnl_node_capability.cc
@ -489,10 +489,7 @@ bool DnnlSumNodeCapability::IsDimensionSupported(const Node* node) const {
 bool DnnlBinaryNodeCapability::Supported(const Node* node, const GraphViewer& graph_viewer) const {
  ORT_UNUSED_PARAMETER(graph_viewer);
  if (!IsTypeSupported(node)) return false;
-  //gpu broadcast for source 0 not supported
-  if (dnnl_engine_get_count(dnnl_engine_kind_t::dnnl_gpu)) {
-    return false;
-  }
+
  return true;
 }

--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_binary.cc
+++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_binary.cc
@ -24,8 +24,10 @@ void DnnlBinary::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
  }

  // GetMemory in OrtFormat. Broadcasting and mix format binary ops can result in computation failure
-  auto src_0_ori_md = sp.GetMemoryInOrtFormat(node.Input(IN_A), eng).get_desc();
-  auto src_1_ori_md = sp.GetMemoryInOrtFormat(node.Input(IN_B), eng).get_desc();
+  auto binary_src0_mem = sp.GetMemoryInOrtFormat(node.Input(IN_A), eng);
+  auto binary_src1_mem = sp.GetMemoryInOrtFormat(node.Input(IN_B), eng);
+  auto src_0_ori_md = binary_src0_mem.get_desc();
+  auto src_1_ori_md = binary_src1_mem.get_desc();

  auto src_0_dims = src_0_ori_md.dims();
  auto src_1_dims = src_1_ori_md.dims();
@ -53,11 +55,6 @@ void DnnlBinary::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
  auto binary_d = dnnl::binary::desc(algo, src_0_md, src_1_md, dst_md);
  auto binary_pd = dnnl::binary::primitive_desc(binary_d, eng);

-  auto binary_src0_mem = sp.GetMemoryAndReshape(node.Input(IN_A), binary_pd.src0_desc(), eng);
-  auto binary_src1_mem = sp.GetMemoryAndReshape(node.Input(IN_B), binary_pd.src1_desc(), eng);
-
-
-
  auto binary_dst_mem = dnnl::memory(binary_pd.dst_desc(), eng);
  auto binary_prim = dnnl::binary(binary_pd);

--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_layernorm.cc
+++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_layernorm.cc
@ -108,7 +108,7 @@ void DnnlLayerNorm::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
  // X = LayerNornm(X)
  // Check if we are training and need the extra outputs for backprop
  dnnl::prop_kind prop_kind;
-#if defined(ENABLE_TRAINING)
+#if 0 //defined(ENABLE_TRAINING)
  prop_kind = dnnl::prop_kind::forward_training;
 #else
  prop_kind = dnnl::prop_kind::forward_inference;
@ -146,18 +146,20 @@ void DnnlLayerNorm::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
  }

 // Check outputs used for training
-#if defined(ENABLE_TRAINING)
+#if 0 //defined(ENABLE_TRAINING)
  // If Mean exists
-  if (node.Output(OUT_MEAN).Exists()) {
-    auto mean_mem = dnnl::memory(lnorm_pd.mean_desc(), dnnl_engine);
-    lnorm_args.insert({DNNL_ARG_MEAN, mean_mem});
-    sp.SetMemory(node.Output(OUT_MEAN), mean_mem);
-  }
-  // If Variance exists
-  if (node.Output(OUT_INV_STD_VAR).Exists()) {
-    auto variance_mem = dnnl::memory(lnorm_pd.variance_desc(), dnnl_engine);
-    lnorm_args.insert({DNNL_ARG_VARIANCE, variance_mem});
-    sp.SetMemory(node.Output(OUT_INV_STD_VAR), variance_mem);
+  if (node.OutputCount() > 1) {
+    if (node.Output(OUT_MEAN).Exists()) {
+      auto mean_mem = dnnl::memory(lnorm_pd.mean_desc(), dnnl_engine);
+      lnorm_args.insert({DNNL_ARG_MEAN, mean_mem});
+      sp.SetMemory(node.Output(OUT_MEAN), mean_mem);
+    }
+    // If Variance exists
+    if (node.Output(OUT_INV_STD_VAR).Exists()) {
+      auto variance_mem = dnnl::memory(lnorm_pd.variance_desc(), dnnl_engine);
+      lnorm_args.insert({DNNL_ARG_VARIANCE, variance_mem});
+      sp.SetMemory(node.Output(OUT_INV_STD_VAR), variance_mem);
+    }
  }
 #endif  // ENABLE_TRAINING

--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_matmul.cc
+++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_matmul.cc
@ -66,85 +66,91 @@ void DnnlMatMul::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {


  //Holds transposed matrices A and B. ToDo: Eliminate its usage if in place transpose is possbile for FusedMatmul
-  dnnl::memory::desc intermediateA_md;
-  dnnl::memory intermediateA_mem;
+  dnnl::memory::desc transposedA_md;
+  dnnl::memory transposedA_mem;

-  dnnl::memory::desc intermediateB_md;
-  dnnl::memory intermediateB_mem;
+  dnnl::memory::desc transposedB_md;
+  dnnl::memory transposedB_mem;

  if (is_fusedmatmul)
  {
    if (transA || transBatchA) {
      dnnl::memory::dims strides = GetStrides(dataA_dims, transA, transBatchA, transposedA_dims);

-      intermediateA_md = dnnl::memory::desc(dataA_dims, node.Input(IN_A).Type(), strides);
-      intermediateA_mem = dnnl::memory(intermediateA_md, eng);
+      dnnl::memory::desc intermediateA_md = dnnl::memory::desc(dataA_dims, node.Input(IN_A).Type(), strides);
+      dnnl::memory intermediateA_mem = dnnl::memory(intermediateA_md, eng);

      auto traspose_primitive = dnnl::reorder(dataA_mem, intermediateA_mem);
-      sp.AddPrimitive(traspose_primitive, {{DNNL_ARG_FROM, dataA_mem},
-                                           {DNNL_ARG_TO, intermediateA_mem}});
+      sp.AddPrimitive(traspose_primitive, {{DNNL_ARG_FROM, dataA_mem}, {DNNL_ARG_TO, intermediateA_mem}});

-      // The reorder from above will get the memory in the right order. The next few lines will create a memory and memory descriptor
-      // that will have the correct dimentions and correct memory::format
-      dnnl::memory::desc transposed_md = dnnl::memory::desc(transposedA_dims, node.Input(IN_A).Type(), sp.GetDnnlFormat(dataA_dims.size()));
-      dnnl::memory transposed_mem = dnnl::memory(transposed_md, eng, nullptr);
-      void* handle = intermediateA_mem.get_data_handle();
-      transposed_mem.set_data_handle(handle);
      while (transposedA_dims.size() < weights_dims.size()) {
        transposedA_dims.insert(transposedA_dims.begin(), 1);
      }
+
+      // The reorder from above will get the memory in the right order. The next few lines will create a memory and memory descriptor
+      // that will have the correct dimentions and correct memory::format
+      transposedA_md = dnnl::memory::desc(transposedA_dims, node.Input(IN_A).Type(), sp.GetDnnlFormat(transposedA_dims.size()));
+      transposedA_mem = dnnl::memory(transposedA_md, eng, nullptr);
+      void* handle = intermediateA_mem.get_data_handle();
+      transposedA_mem.set_data_handle(handle);
    }
    if (transB || transBatchB) {                // Exact same logic for matrix B as used for matrix A
      dnnl::memory::dims strides = GetStrides(dataB_dims, transB, transBatchB, transposedB_dims);

-      intermediateB_md = dnnl::memory::desc(dataB_dims, node.Input(IN_B).Type(), strides);
-      intermediateB_mem = dnnl::memory(intermediateB_md, eng);
+      dnnl::memory::desc intermediateB_md = dnnl::memory::desc(dataB_dims, node.Input(IN_B).Type(), strides);
+      dnnl::memory intermediateB_mem = dnnl::memory(intermediateB_md, eng);

      auto traspose_primitive = dnnl::reorder(dataB_mem, intermediateB_mem);
-      sp.AddPrimitive(traspose_primitive, {{DNNL_ARG_FROM, dataB_mem},
-                                           {DNNL_ARG_TO, intermediateB_mem}});
+      sp.AddPrimitive(traspose_primitive, {{DNNL_ARG_FROM, dataB_mem}, {DNNL_ARG_TO, intermediateB_mem}});

-      // The reorder from above will get the memory in the right order. The next few lines will create a memory and memory descriptor
-      // that will have the correct dimentions and correct memory::format
-      dnnl::memory::desc transposed_md = dnnl::memory::desc(transposedB_dims, node.Input(IN_A).Type(), sp.GetDnnlFormat(dataB_dims.size()));
-      dnnl::memory transposed_mem = dnnl::memory(transposed_md, eng, nullptr);
-      void* handle = intermediateB_mem.get_data_handle();
-      transposed_mem.set_data_handle(handle);
      while (src_dims.size() > transposedB_dims.size()) {
        transposedB_dims.insert(transposedB_dims.begin(), 1);
      }
+
+      // The reorder from above will get the memory in the right order. The next few lines will create a memory and memory descriptor
+      // that will have the correct dimentions and correct memory::format
+      transposedB_md = dnnl::memory::desc(transposedB_dims, node.Input(IN_B).Type(), sp.GetDnnlFormat(transposedB_dims.size()));
+      transposedB_mem = dnnl::memory(transposedB_md, eng, nullptr);
+      void* handle = intermediateB_mem.get_data_handle();
+      transposedB_mem.set_data_handle(handle);
+      
    }
  }

  dnnl::memory::desc src_md;
  if (transA || transBatchA) {
-    src_md = dnnl::memory::desc(transposedA_dims, node.Input(IN_A).Type(), dnnl::memory::format_tag::any);
+    src_md = transposedA_md;
  } else {
    src_md = dnnl::memory::desc(src_dims, node.Input(IN_A).Type(), dnnl::memory::format_tag::any);
  }
    
  dnnl::memory::desc weights_md;
  if (transB || transBatchB) {
-    weights_md = dnnl::memory::desc(transposedB_dims, node.Input(IN_B).Type(), dnnl::memory::format_tag::any);
+    weights_md = transposedB_md;
  } else {
    weights_md = dnnl::memory::desc(weights_dims, node.Input(IN_B).Type(), dnnl::memory::format_tag::any);
  }
    

  auto output_shape = src_dims;
-  if (transA || transBatchA)
+  if (transA || transBatchA) {
    output_shape = transposedA_dims;
+  }
  output_shape.pop_back();
-  if (transB || transBatchB)
+  if (transB || transBatchB) {
    output_shape.emplace_back(transposedB_dims.back());
-  else
+  } else {
    output_shape.emplace_back(weights_dims.back());
+  }
+    
  for (size_t i = 0; i < output_shape.size() - 2; i++) {
    if (output_shape[i] == 1) {
-      if (transB || transBatchB)
+        if (transB || transBatchB) {
        output_shape[i] = transposedB_dims[i];
-      else
-        output_shape[i] = weights_dims[i];
+      } else {
+          output_shape[i] = weights_dims[i];
+        }
+        
    }
  }

@ -195,12 +201,12 @@ void DnnlMatMul::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
  auto matmul_prim = dnnl::matmul(matmul_pd);

  if (transA || transBatchA) {
-    matmul_src_mem = intermediateA_mem;
+    matmul_src_mem = transposedA_mem;
  } else {
    matmul_src_mem = sp.GetMemoryAndReshape(node.Input(IN_A), matmul_pd.src_desc(), eng);
  }
  if (transB || transBatchB) {
-    matmul_weights_mem = intermediateB_mem;
+    matmul_weights_mem = transposedB_mem;
  } else {
    matmul_weights_mem = sp.GetMemoryAndReshape(node.Input(IN_B), matmul_pd.weights_desc(), eng);
  }
--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_pool.cc
+++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_pool.cc
@ -12,8 +12,15 @@ DnnlPool::DnnlPool() {}

 void DnnlPool::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
  auto dnnl_engine = sp.GetEngine();
-
+#ifdef ENABLE_TRAINING
+  // When using training the memory needs to be in a format known to pool_forward and the
+  // pool_backward primitives. Since we don't currently have a way to pass the memory format
+ // from pool_forward to pool_backward; we are choosing to use Onnxruntime's memory format
+ // as the common memory format to be used by both forward and the backward primitives.
+ auto pool_src_mem = sp.GetMemoryInOrtFormat(node.Input(IN_X), dnnl_engine);
+#else
  auto pool_src_mem = sp.GetMemory(node.Input(IN_X));
+#endif  // ENABLE_TRAINING
  auto src_md = pool_src_mem.get_desc();
  auto src_dims = pool_src_mem.get_desc().dims();

@ -51,8 +58,10 @@ void DnnlPool::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {

  auto pool_pd = dnnl::pooling_forward::primitive_desc(pool_desc, dnnl_engine);

+#ifndef ENABLE_TRAINING
  // If using GPU this will move the memory from the CPU to the GPU.
  pool_src_mem = sp.GetMemoryAndReshape(node.Input(IN_X), pool_pd.src_desc(), dnnl_engine);
+#endif
  dnnl::memory pool_dst_mem = dnnl::memory(pool_pd.dst_desc(), dnnl_engine);

  auto pool_op = dnnl::pooling_forward(pool_pd);
--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_poolgrad.cc
+++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_poolgrad.cc
@ -74,6 +74,7 @@ void DnnlPoolGrad::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {

  auto dx_dims = node.Output(OUT_DX).Dim();
  dnnl::memory::desc dx_md(dx_dims, node.Input(IN_DY).Type(), dnnl::memory::format_tag::any);
+  dnnl::memory::desc fwd_dx_md(dx_dims, node.Input(IN_DY).Type(), sp.GetDnnlFormat(dx_dims.size()));

  //Read the attributes
  auto kernel_shape = GetKernelShape(node);
@ -92,7 +93,7 @@ void DnnlPoolGrad::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
  }

  dnnl::pooling_forward::desc pool_forward_desc(dnnl::prop_kind::forward, algo,
-                                                dx_md, dy_md,
+                                                fwd_dx_md, dy_md,
                                                strides, kernel_shape,
                                                padding_left, padding_right);
  dnnl::pooling_forward::primitive_desc pool_forward_pd(pool_forward_desc, dnnl_engine);
@ -110,9 +111,9 @@ void DnnlPoolGrad::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {

  dnnl::memory dx_mem(pool_backward_pd.diff_src_desc(), dnnl_engine);
  if (maxpoolgrad_optype) {
-  sp.AddPrimitive(pool_backward_op, {{DNNL_ARG_DIFF_DST, dy_mem},
-                                     {DNNL_ARG_DIFF_SRC, dx_mem},
-                                     {DNNL_ARG_WORKSPACE, indices_mem}});
+    sp.AddPrimitive(pool_backward_op, {{DNNL_ARG_DIFF_DST, dy_mem},
+                                       {DNNL_ARG_DIFF_SRC, dx_mem},
+                                       {DNNL_ARG_WORKSPACE, indices_mem}});
  } else {
    sp.AddPrimitive(pool_backward_op, {{DNNL_ARG_DIFF_DST, dy_mem},
                                       {DNNL_ARG_DIFF_SRC, dx_mem}});
--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.cc
+++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.cc
@ -36,7 +36,35 @@

 #include <inttypes.h>
 #include <stdio.h>
+#include <iostream>
+#include <iomanip>

+/*
+* The DNNL_TENSOR_PRINT_MEMORY should always be 0 unless debugging
+*
+* These macros can be used to print the contents of a OneDNN tensor
+* This can be used to debug and investigate the values inputs and outputs
+* of OneDNN ops.
+*
+* To use set DNNL_TENSOR_PRINT_MEMORY to 1
+* Find the operator you want to investigate and add the memory you want to print when
+* calling AddPrimitive() for example:
+* Change this code:
+* ```
+* sp.AddPrimitive(elemenwise_primitive,
+                  {{DNNL_ARG_SRC, elementwise_src_mem}, {DNNL_ARG_DST, elementwise_dst_mem}});
+* ```
+* to
+* ```
+* sp.AddPrimitive(elemenwise_primitive,
+                  {{DNNL_ARG_SRC, elementwise_src_mem}, {DNNL_ARG_DST, elementwise_dst_mem}},
+                  {DNNL_ARG_SRC, DNNL_ARG_DST});
+* ```
+* Then rebuild and run the code.
+* This is a developer only solution to investigating contents of OneDNN's tensors.
+*/
+#define DNNL_TENSOR_PRINT_MEMORY 0
+#define DNNL_TENSOR_PRINT_MEMORY_MAX_TENSOR_ELEMENTS 150

 namespace onnxruntime {
 namespace ort_dnnl {
@ -46,12 +74,12 @@ inline bool Contains(const Map& map, const Key& key) {
  return map.find(key) != map.end();
 }

-
+#if DNNL_TENSOR_PRINT_MEMORY
 void DnnlSubgraphPrimitive::PrintMemory(const dnnl::memory& mem) {
  auto md = mem.get_desc();
  auto dt = md.data_type();
  auto dims = md.dims();
-  if (Product(dims) > 50) {
+  if (Product(dims) > DNNL_TENSOR_PRINT_MEMORY_MAX_TENSOR_ELEMENTS) {
    printf("tensor too long ignore printing \n");
    return;
  }
@ -75,10 +103,12 @@ void DnnlSubgraphPrimitive::PrintMemory(const dnnl::memory& mem) {
      ((char*)data_vec.data())[i] = ((char*)dh)[i];
    }

+    std::cout << "[";
    for (auto& data : data_vec) {
-      printf("%.6f ", data);
+      std::cout << std::setprecision(6) << data;
+      if (&data != &data_vec.back()) std::cout << ", ";
    }
-    printf("\n");
+    std::cout << "]\n";
  }
  else if (dt == dnnl::memory::data_type::u8) {
    std::vector<uint8_t> data_vec(Product(dims));
@ -87,10 +117,12 @@ void DnnlSubgraphPrimitive::PrintMemory(const dnnl::memory& mem) {
      ((char*)data_vec.data())[i] = ((char*)dh)[i];
    }

+    std::cout << "[";
    for (auto& data : data_vec) {
-      printf("%" PRIu8 "\n", data);
+      std::cout << +data;
+      if (&data != &data_vec.back()) std::cout << ", ";
    }
-    printf("\n");
+    std::cout << "]\n";
  } else if (dt == dnnl::memory::data_type::s8) {
    std::vector<int8_t> data_vec(Product(dims));
    auto dh = to_mem.get_data_handle();
@ -98,10 +130,12 @@ void DnnlSubgraphPrimitive::PrintMemory(const dnnl::memory& mem) {
      ((char*)data_vec.data())[i] = ((char*)dh)[i];
    }

+    std::cout << "[";
    for (auto& data : data_vec) {
-      printf("%" PRIi8 "\n", data);
+      std::cout << +data;
+      if (&data != &data_vec.back()) std::cout << ", ";
    }
-    printf("\n");
+    std::cout << "]\n";
  } else if (dt == dnnl::memory::data_type::s32) {
    std::vector<int32_t> data_vec(Product(dims));
    auto dh = to_mem.get_data_handle();
@ -109,14 +143,17 @@ void DnnlSubgraphPrimitive::PrintMemory(const dnnl::memory& mem) {
    ((char*)data_vec.data())[i] = ((char*)dh)[i];
    }

+    std::cout << "[";
    for (auto& data : data_vec) {
-      printf("%" PRIi32 "\n", data);
+      std::cout << data;
+      if (&data != &data_vec.back()) std::cout << ", ";
    }
-    printf("\n");
+    std::cout << "]\n";
  } else {
    ORT_THROW("Cannot print such data type");
  }
 }
+#endif // DNNL_TENSOR_PRINT_MEMORY

 int Product(dnnl::memory::dims d) {
  int result = 1;
@ -647,9 +684,8 @@ onnxruntime::common::Status DnnlSubgraphPrimitive::Predict(const std::unordered_
  for (size_t i = 0; i < net_.size(); ++i) {
    net_.at(i).execute(stream, net_args_.at(i));
    stream.wait();
-    
+#if DNNL_TENSOR_PRINT_MEMORY
    //for debug memory purpose
-    /*
    for (auto e : items_to_print_) {
      auto net_index = e.first;
      auto net_arg_index = e.second;
@ -657,8 +693,7 @@ onnxruntime::common::Status DnnlSubgraphPrimitive::Predict(const std::unordered_
        PrintMemory(net_args_.at(i)[net_arg_index]);
      }
    }
-    */
-    
+#endif  //DNNL_TENSOR_PRINT_MEMORY
  }

  return Status::OK();
--- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
+++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
@ -2749,7 +2749,7 @@ TEST(ReductionOpTest, ReduceInfLogSum) {
                        {FLOAT_INF, FLOAT_INF,
                         -std::numeric_limits<float>::quiet_NaN(), std::numeric_limits<float>::quiet_NaN(),
                         std::numeric_limits<float>::quiet_NaN(), std::numeric_limits<float>::quiet_NaN()});
-  test.Run();
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kDnnlExecutionProvider});
 }

 TEST(ReductionOpTest, ReduceInfLogSumExp) {