From a92ccbe1bc015d0b679b08f0c623d4be164ff7b3 Mon Sep 17 00:00:00 2001
From: Scott McKay <skottmckay@gmail.com>
Date: Fri, 9 Oct 2020 22:34:32 +1000
Subject: [PATCH] Various armv7 related fixes (#5394)

* - Link with libatomic if needed
 - Install pip differently so it doesn't clash with the system pip which may involve a wrapper script
 - Remove ability to specify offset when Tensor allocates the data. The data prior to offset isn't accessible by anything.
 - Fix use of offset in TensorOpTest to work on armv7 where it must be aligned to the type it points to.
 - Fix ActivationOpNoInfTest.Softsign to allow for armv7 behavior
 - Fix ReductionOpTest.ReduceMean_*keepdims to allow for armv7 floating point inaccuracy

* Address PR comments
---
 cmake/CMakeLists.txt                          |  20 ++++
 cmake/onnxruntime_common.cmake                |   8 +-
 dockerfiles/Dockerfile.arm32v7                |  27 +++--
 include/onnxruntime/core/framework/tensor.h   |   5 +-
 onnxruntime/core/framework/sparse_tensor.cc   |   6 +-
 onnxruntime/core/framework/tensor.cc          |   7 +-
 onnxruntime/test/framework/tensor_test.cc     |  58 +++++-----
 .../cpu/activation/activation_op_test.cc      | 108 +++++++++++-------
 .../cpu/reduction/reduction_ops_test.cc       |  25 +++-
 9 files changed, 163 insertions(+), 101 deletions(-)
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 8aee11ffaf..6f25b2f4d2 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -305,6 +305,26 @@ if(onnxruntime_DISABLE_EXCEPTIONS)
   endif()
 endif()
 
+# We need to link with libatomic on systems that do not have built-in atomics, or
+# don't have built-in support for 8 byte atomics. Sets onnxruntime_LINK_LIBATOMIC accordingly.
+# Derived from https://github.com/protocolbuffers/protobuf/blob/master/cmake/CMakeLists.txt
+set(onnxruntime_LINK_LIBATOMIC false)
+if (NOT MSVC)
+  include(CheckCXXSourceCompiles)
+  # CMAKE_REQUIRED_FLAGS must be a space-delimited string (a ;-list will not work), so quote it.
+  set(OLD_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")
+  set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -std=c++11")
+  check_cxx_source_compiles("
+    #include <atomic>
+    #include <cstdint>
+    int main() { return std::atomic<std::int64_t>{}; }
+  " onnxruntime_HAVE_BUILTIN_ATOMICS)
+  if (NOT onnxruntime_HAVE_BUILTIN_ATOMICS)
+    set(onnxruntime_LINK_LIBATOMIC true)
+  endif ()
+  set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
+endif ()
+
 set(REPO_ROOT ${PROJECT_SOURCE_DIR}/..)
 set(ONNXRUNTIME_ROOT ${PROJECT_SOURCE_DIR}/../onnxruntime)
 set(ORTTRAINING_ROOT ${PROJECT_SOURCE_DIR}/../orttraining)
diff --git a/cmake/onnxruntime_common.cmake b/cmake/onnxruntime_common.cmake
index 90b54b4079..d4da8199cc 100644
--- a/cmake/onnxruntime_common.cmake
+++ b/cmake/onnxruntime_common.cmake
@@ -138,4 +138,10 @@ endif()
 
 if (onnxruntime_WINML_NAMESPACE_OVERRIDE STREQUAL "Windows")
   target_compile_definitions(onnxruntime_common PRIVATE "BUILD_INBOX=1")
-endif()
\ No newline at end of file
+endif()
+
+# check if we need to link against libatomic due to std::atomic usage by the threadpool code
+# e.g. Raspberry Pi requires this
+if (onnxruntime_LINK_LIBATOMIC)
+  list(APPEND onnxruntime_EXTERNAL_LIBRARIES atomic)
+endif()
diff --git a/dockerfiles/Dockerfile.arm32v7 b/dockerfiles/Dockerfile.arm32v7
index f8cb5902df..bcc7fc55f7 100644
--- a/dockerfiles/Dockerfile.arm32v7
+++ b/dockerfiles/Dockerfile.arm32v7
@@ -1,9 +1,11 @@
+# Import info for 32-bit Qemu based build
+# There are also raspberry pi 4 and 64-bit images available so adjust as required
 FROM balenalib/raspberrypi3-python:latest-stretch-build
 
 ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime
 ARG ONNXRUNTIME_SERVER_BRANCH=master
 
-#Enforces cross-compilation through Quemu
+# Enforces cross-compilation through Qemu.
 RUN [ "cross-build-start" ]
 
 RUN install_packages \
@@ -14,44 +16,41 @@ RUN install_packages \
     libssl-dev \
     wget \
     python3 \
-    python3-pip \
     python3-dev \
     git \
     tar \
     libatlas-base-dev
 
-RUN pip3 install --upgrade pip
+# Install the latest pip via get-pip.py so it doesn't clash with the system pip, which may involve a wrapper script
+WORKDIR /pip
+RUN wget https://bootstrap.pypa.io/get-pip.py
+RUN python3 get-pip.py
 RUN pip3 install --upgrade setuptools
 RUN pip3 install --upgrade wheel
 RUN pip3 install numpy
 
 # Build the latest cmake
 WORKDIR /code
-RUN wget https://github.com/Kitware/CMake/releases/download/v3.14.3/cmake-3.14.3.tar.gz
-RUN tar zxf cmake-3.14.3.tar.gz
+RUN wget https://github.com/Kitware/CMake/releases/download/v3.18.3/cmake-3.18.3.tar.gz
+RUN tar zxf cmake-3.18.3.tar.gz 
 
-WORKDIR /code/cmake-3.14.3
+WORKDIR /code/cmake-3.18.3
 RUN ./configure --system-curl
 RUN make
 RUN sudo make install
 
 # Set up build args
 ARG BUILDTYPE=MinSizeRel
+# if doing a 64-bit build change '--arm' to '--arm64'
 ARG BUILDARGS="--config ${BUILDTYPE} --arm"
 
 # Prepare onnxruntime Repo
 WORKDIR /code
 RUN git clone --single-branch --branch ${ONNXRUNTIME_SERVER_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime
 
-# Start the basic build
+# Build ORT including the shared lib and python bindings
 WORKDIR /code/onnxruntime
-RUN ./build.sh --use_openmp ${BUILDARGS} --update --build
-
-# Build Shared Library
-RUN ./build.sh --use_openmp ${BUILDARGS} --build_shared_lib
-
-# Build Python Bindings and Wheel
-RUN ./build.sh --use_openmp ${BUILDARGS} --enable_pybind --build_wheel
+RUN ./build.sh --use_openmp ${BUILDARGS} --update --build --build_shared_lib --build_wheel
 
 # Build Output
 RUN ls -l /code/onnxruntime/build/Linux/${BUILDTYPE}/*.so
diff --git a/include/onnxruntime/core/framework/tensor.h b/include/onnxruntime/core/framework/tensor.h
index e6679f32c7..cdf1d24b23 100644
--- a/include/onnxruntime/core/framework/tensor.h
+++ b/include/onnxruntime/core/framework/tensor.h
@@ -61,11 +61,12 @@ class Tensor final {
   Tensor() = default;  // to allow creating vector<Tensor> to support seq(tensor)
 
   /**
-   * Create tensor with given type, shape, pre-allocate memory and allocator info.
+   * Create tensor with given type, shape, pre-allocated memory and allocator info.
    * This function won't check if the preallocated buffer(p_data) has enough room for the shape.
    * \param data A preallocated buffer. Can be NULL if the shape is empty.
    *              Tensor does not own the data and will not delete it
    * \param alloc Where the buffer('data') was allocated from
+   * \param offset Offset in bytes from the start of p_data to the start of the Tensor's data.
    */
   Tensor(MLDataType p_type, const TensorShape& shape, void* p_data, const OrtMemoryInfo& alloc,
          ptrdiff_t offset = 0);
@@ -74,7 +75,7 @@ class Tensor final {
    * Deprecated. The orginal design is this Tensor class won't do any allocation / release.
    * However, this function will allocate the buffer for the shape, and do placement new if p_type is string tensor.
    */
-  Tensor(MLDataType p_type, const TensorShape& shape, std::shared_ptr<IAllocator> allocator, ptrdiff_t offset = 0);
+  Tensor(MLDataType p_type, const TensorShape& shape, std::shared_ptr<IAllocator> allocator);
 
   ~Tensor();
 
diff --git a/onnxruntime/core/framework/sparse_tensor.cc b/onnxruntime/core/framework/sparse_tensor.cc
index 4542be551d..0d07dc1799 100644
--- a/onnxruntime/core/framework/sparse_tensor.cc
+++ b/onnxruntime/core/framework/sparse_tensor.cc
@@ -16,7 +16,7 @@ SparseTensor::SparseTensor(MLDataType elt_type,
                            void* values_data,
                            void* indices_data,
                            const OrtMemoryInfo& memory_info)
-    : values_(elt_type, TensorShape({static_cast<int64_t>(nnz)}), values_data, memory_info, 0),
+    : values_(elt_type, TensorShape({static_cast<int64_t>(nnz)}), values_data, memory_info),
       indices_(DataTypeImpl::GetType<int64_t>(),
                TensorShape({static_cast<int64_t>(nnz), static_cast<int64_t>(shape.NumDimensions())}),
                indices_data, memory_info, 0),
@@ -26,10 +26,10 @@ SparseTensor::SparseTensor(MLDataType elt_type,
                            const TensorShape& shape,
                            size_t nnz,
                            std::shared_ptr<IAllocator> allocator)
-    : values_(elt_type, TensorShape({static_cast<int64_t>(nnz)}), allocator, 0),
+    : values_(elt_type, TensorShape({static_cast<int64_t>(nnz)}), allocator),
       indices_(DataTypeImpl::GetType<int64_t>(),
                TensorShape({static_cast<int64_t>(nnz), static_cast<int64_t>(shape.NumDimensions())}),
-               allocator, 0),
+               allocator),
       shape_(shape) {}
 
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/framework/tensor.cc b/onnxruntime/core/framework/tensor.cc
index e9923de7d7..5276830169 100644
--- a/onnxruntime/core/framework/tensor.cc
+++ b/onnxruntime/core/framework/tensor.cc
@@ -17,7 +17,7 @@ Tensor::Tensor(MLDataType p_type, const TensorShape& shape, void* p_data, const
   Init(p_type, shape, p_data, nullptr, offset);
 }
 
-Tensor::Tensor(MLDataType p_type, const TensorShape& shape, std::shared_ptr<IAllocator> allocator, ptrdiff_t offset)
+Tensor::Tensor(MLDataType p_type, const TensorShape& shape, std::shared_ptr<IAllocator> allocator)
     : alloc_info_(allocator->Info()) {
   ORT_ENFORCE(p_type != nullptr);
   int64_t shape_size = shape.Size();  // value returned is checked for overflow by TensorShape::Size()
@@ -30,13 +30,10 @@ Tensor::Tensor(MLDataType p_type, const TensorShape& shape, std::shared_ptr<IAll
     if (!allocator->CalcMemSizeForArray(SafeInt<size_t>(shape_size), p_type->Size(), &len))
       ORT_THROW("tensor failed memory size calculation");
 
-    // TODO: Use case for this isn't clear. We allocate a buffer based on the tensor shape and increase it by offset.
-    // Who is going to use the memory prior to offset, and/or why should it be allocated here?
-    len += offset;
     p_data = allocator->Alloc(len);
   }
 
-  Init(p_type, shape, p_data, allocator, offset);
+  Init(p_type, shape, p_data, allocator);
 }
 
 size_t Tensor::SizeInBytes() const {
diff --git a/onnxruntime/test/framework/tensor_test.cc b/onnxruntime/test/framework/tensor_test.cc
index d3ca24b88c..6066e6cb8b 100644
--- a/onnxruntime/test/framework/tensor_test.cc
+++ b/onnxruntime/test/framework/tensor_test.cc
@@ -13,42 +13,43 @@
 namespace onnxruntime {
 namespace test {
 template <typename T>
-void CPUTensorTest(std::vector<int64_t> dims, const int offset = 0) {
-  //not own the buffer
-  TensorShape shape(dims);
+void CPUTensorTest(std::vector<int64_t> dims, const int offset_elements = 0) {
+  // create Tensor where we provide the buffer
+  TensorShape shape(dims);  // this is the shape that will be available starting at the offset in the Tensor
   auto alloc = TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault);
-  auto data = alloc->Alloc(sizeof(T) * (shape.Size() + offset));
-  EXPECT_TRUE(data);
-  Tensor t(DataTypeImpl::GetType<T>(), shape, data, alloc->Info(), offset);
+  // alloc extra data if needed, as anything before the offset is not covered by the shape
+  auto num_elements = shape.Size() + offset_elements;
+  auto num_bytes = num_elements * sizeof(T);
+  auto offset_bytes = offset_elements * sizeof(T);
+  void* data = alloc->Alloc(num_bytes);
+  const T* first_element = static_cast<const T*>(data) + offset_elements;
+
+  Tensor t(DataTypeImpl::GetType<T>(), shape, data, alloc->Info(), offset_bytes);
   auto tensor_shape = t.Shape();
-  //Use reinterpret_cast to bypass a gcc bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51213
-  EXPECT_EQ(*reinterpret_cast<const std::vector<int64_t>*>(&shape), *reinterpret_cast<const std::vector<int64_t>*>(&tensor_shape));
+  EXPECT_EQ(shape.GetDims(), tensor_shape.GetDims());
   EXPECT_EQ(t.DataType(), DataTypeImpl::GetType<T>());
   auto& location = t.Location();
   EXPECT_STREQ(location.name, CPU);
   EXPECT_EQ(location.id, 0);
 
-  auto t_data = t.template MutableData<T>();
-  EXPECT_TRUE(t_data);
-  memset(t_data, 0, sizeof(T) * shape.Size());
-  EXPECT_EQ(*(T*)((char*)data + offset), (T)0);
+  const T* t_data = t.Data<T>();
+  EXPECT_EQ(first_element, t_data);
   alloc->Free(data);
 
-  Tensor new_t(DataTypeImpl::GetType<T>(), shape, alloc, offset);
+  // test when the Tensor allocates the buffer.
+  // there's no point using an offset_elements here as you'd be allocating extra data prior to the buffer needed
+  // by the Tensor instance.
+  if (offset_elements == 0) {
+    Tensor new_t(DataTypeImpl::GetType<T>(), shape, alloc);
+    EXPECT_TRUE(new_t.OwnsBuffer());
 
-  tensor_shape = new_t.Shape();
-  //Use reinterpret_cast to bypass a gcc bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51213
-  EXPECT_EQ(*reinterpret_cast<const std::vector<int64_t>*>(&shape), *reinterpret_cast<const std::vector<int64_t>*>(&tensor_shape));
-  EXPECT_EQ(new_t.DataType(), DataTypeImpl::GetType<T>());
-  auto& new_location = new_t.Location();
-  ASSERT_STREQ(new_location.name, CPU);
-  EXPECT_EQ(new_location.id, 0);
-
-  auto new_data = new_t.template MutableData<T>();
-  EXPECT_TRUE(new_data);
-  memset(new_data, 0, sizeof(T) * shape.Size());
-  EXPECT_EQ(*(T*)((char*)new_data + offset), (T)0);
-  //no free op as the tensor own the buffer
+    tensor_shape = new_t.Shape();
+    EXPECT_EQ(shape.GetDims(), tensor_shape.GetDims());
+    EXPECT_EQ(new_t.DataType(), DataTypeImpl::GetType<T>());
+    auto& new_location = new_t.Location();
+    ASSERT_STREQ(new_location.name, CPU);
+    EXPECT_EQ(new_location.id, 0);
+  }
 }
 
 TEST(TensorTest, CPUFloatTensorTest) {
@@ -208,11 +209,6 @@ TEST(TensorTest, SizeOverflow) {
 
   Tensor t(type, shape1, nullptr, alloc->Info());
   EXPECT_THROW(t.SizeInBytes(), OnnxRuntimeException);
-
-  // overflow due to offset. max/4 from shape, *4 from float size, + 4 from offset
-  TensorShape shape2({static_cast<int64_t>(std::numeric_limits<size_t>::max() / 4)});
-  ptrdiff_t offset = sizeof(float);  // one more element to push past max
-  EXPECT_THROW(Tensor(type, shape2, alloc, offset), OnnxRuntimeException);
 }
 }  // namespace test
 }  // namespace onnxruntime
diff --git a/onnxruntime/test/providers/cpu/activation/activation_op_test.cc b/onnxruntime/test/providers/cpu/activation/activation_op_test.cc
index 1921e4335c..1e8a26b18d 100644
--- a/onnxruntime/test/providers/cpu/activation/activation_op_test.cc
+++ b/onnxruntime/test/providers/cpu/activation/activation_op_test.cc
@@ -9,60 +9,60 @@ namespace test {
 
 TEST_F(ActivationOpTest, Sigmoid) {
   TestActivationOp("Sigmoid",
-    input_values,
-    [](float x) {
-      auto y = 1.f / (1.f + std::exp(-std::abs(x)));  // safe sigmoid
-      y = x > 0 ? y : 1 - y;
-      return y;
-    });
+                   input_values,
+                   [](float x) {
+                     auto y = 1.f / (1.f + std::exp(-std::abs(x)));  // safe sigmoid
+                     y = x > 0 ? y : 1 - y;
+                     return y;
+                   });
 }
 
 TEST_F(ActivationOpTest, HardSigmoid) {
   float alpha = 0.2f;
   float beta = 0.5f;
   TestActivationOp("HardSigmoid",
-    input_values,
-    [alpha, beta](float x) {
-      return std::max(std::min((alpha * x + beta), 1.0f), 0.0f);
-    },
-    {{"alpha", alpha}, {"beta", beta}});
+                   input_values,
+                   [alpha, beta](float x) {
+                     return std::max(std::min((alpha * x + beta), 1.0f), 0.0f);
+                   },
+                   {{"alpha", alpha}, {"beta", beta}});
 }
 
 TEST_F(ActivationOpTest, Tanh) {
   TestActivationOp("Tanh",
-    input_values,
-    [](float x) { return std::tanh(x); });
+                   input_values,
+                   [](float x) { return std::tanh(x); });
 }
 
 TEST_F(ActivationOpTest, Relu) {
   TestActivationOp("Relu",
-    input_values,
-    [](float x) { return std::max(x, 0.0f); });
+                   input_values,
+                   [](float x) { return std::max(x, 0.0f); });
 }
 
 TEST_F(ActivationOpTest, Elu) {
   float alpha = 0.1f;
   TestActivationOp("Elu",
-    input_values,
-    [alpha](float x) { return (x >= 0) ? x : alpha * (exp(x) - 1); },
-    {{"alpha", alpha}});
+                   input_values,
+                   [alpha](float x) { return (x >= 0) ? x : alpha * (exp(x) - 1); },
+                   {{"alpha", alpha}});
 }
 
 TEST_F(ActivationOpTest, LeakyRelu) {
   float alpha = 0.1f;
   TestActivationOp("LeakyRelu",
-    input_values,
-    [alpha](float x) { return (x >= 0) ? x : alpha * x; },
-    {{"alpha", alpha}});
+                   input_values,
+                   [alpha](float x) { return (x >= 0) ? x : alpha * x; },
+                   {{"alpha", alpha}});
 }
 
 TEST_F(ActivationOpTest, ThresholdedRelu) {
   float alpha = 0.1f;
   TestActivationOp(
-    "ThresholdedRelu",
-    input_values,
-    [alpha](float x) { return (x >= alpha) ? x : 0; },
-    {{"alpha", alpha}}, true, 10);
+      "ThresholdedRelu",
+      input_values,
+      [alpha](float x) { return (x >= alpha) ? x : 0; },
+      {{"alpha", alpha}}, true, 10);
 }
 
 TEST_F(ActivationOpTest, Selu) {
@@ -70,9 +70,9 @@ TEST_F(ActivationOpTest, Selu) {
   static constexpr float gamma = 1.0507f;
 
   TestActivationOp("Selu",
-    input_values,
-    [](float x) { return x <= 0 ? gamma * (alpha * exp(x) - alpha) : gamma * x; },
-    {{"alpha", alpha}, {"gamma", gamma}});
+                   input_values,
+                   [](float x) { return x <= 0 ? gamma * (alpha * exp(x) - alpha) : gamma * x; },
+                   {{"alpha", alpha}, {"gamma", gamma}});
 }
 
 TEST_F(ActivationOpTest, Selu_Attributes) {
@@ -80,9 +80,9 @@ TEST_F(ActivationOpTest, Selu_Attributes) {
   static constexpr float gamma = 0.5f;
 
   TestActivationOp("Selu",
-    input_values,
-    [](float x) { return x <= 0 ? gamma * (alpha * exp(x) - alpha) : gamma * x; },
-    {{"alpha", alpha}, {"gamma", gamma}});
+                   input_values,
+                   [](float x) { return x <= 0 ? gamma * (alpha * exp(x) - alpha) : gamma * x; },
+                   {{"alpha", alpha}, {"gamma", gamma}});
 }
 
 TEST_F(ActivationOpTest, PRelu) {
@@ -145,20 +145,46 @@ TEST_F(ActivationOpTest, PRelu_MultiChannel) {
 
 TEST_F(ActivationOpTest, Softplus) {
   TestActivationOp("Softplus",
-    input_values,
-    [](float x) {
-      if (x > 0)
-        return x + logf(expf(-x) + 1);
-      else
-        return logf(expf(x) + 1);
-    });
+                   input_values,
+                   [](float x) {
+                     if (x > 0)
+                       return x + logf(expf(-x) + 1);
+                     else
+                       return logf(expf(x) + 1);
+                   });
 }
 
 TEST_F(ActivationOpNoInfTest, Softsign) {
   TestActivationOp(
-    "Softsign",
-    input_values,
-    [](float x) { return x / (1 + std::abs(x)); }, {}, false);  // Disable TensorRT because result mismatches
+      "Softsign",
+      input_values,
+      [](float x) {
+        auto result = x / (1 + std::abs(x));
+
+#if defined(__arm__)
+        // Softsign uses Eigen inverse(), which on ARM32 results in a different value when x is FLT_MAX or -FLT_MAX
+        // 3.40282347e+38 -> 0 with ARM32 inverse() vs something like 2.939e-39#DEN with other platforms.
+        //
+        // Possibly explained by https://en.wikipedia.org/wiki/ARM_architecture#Advanced_SIMD_(Neon)
+        // 'A quirk of Neon in Armv7 devices is that it flushes all subnormal numbers to zero'
+        //
+        // c.f.
+        // cmake\external\eigen\Eigen\src\Core\arch\SSE\PacketMath.h uses _mm_div_ps for 'pdiv<Packet4f>'
+        // cmake\external\eigen\Eigen\src\Core\arch\NEON\PacketMath.h uses a custom implementation for 'pdiv<Packet4f>'
+        //
+        // Special case the expected values to allow for that. If handling FLT_MAX more consistently is required
+        // we'd need to not use Eigen for Softsign on ARM32.
+        //
+        if (x == FLT_MAX) {
+          result = 0.;
+        } else if (x == -FLT_MAX) {
+          result = -0.;
+        }
+#endif
+
+        return result;
+      },
+      {}, false);  // Disable TensorRT because result mismatches
 }
 
 }  // namespace test
diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
index cf4985c9b1..602913d300 100644
--- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
+++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
@@ -598,7 +598,7 @@ TEST(ReductionOpTest, ReduceMax_int32) {
 #if defined(OPENVINO_CONFIG_GPU_FP32) || defined(OPENVINO_CONFIG_GPU_FP16) || defined(OPENVINO_CONFIG_MYRIAD)
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider});  // OpenVINO: Disabled temporarily
 #else
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});                          //TensorRT: axis must be 0
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: axis must be 0
 #endif
 }
 
@@ -619,7 +619,7 @@ TEST(ReductionOpTest, ReduceMax_int64) {
 #if defined(OPENVINO_CONFIG_GPU_FP32) || defined(OPENVINO_CONFIG_GPU_FP16) || defined(OPENVINO_CONFIG_MYRIAD)
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider});  // OpenVINO: Disabled temporarily
 #else
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});                          //TensorRT: axis must be 0
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: axis must be 0
 #endif
 }
 
@@ -640,7 +640,7 @@ TEST(ReductionOpTest, ReduceMax_int8) {
 #if defined(OPENVINO_CONFIG_MYRIAD)
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider});  // OpenVINO: Disabled temporarily
 #else
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});                          //TensorRT: axis must be 0
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: axis must be 0
 #endif
 }
 
@@ -661,7 +661,7 @@ TEST(ReductionOpTest, ReduceMax_uint8) {
 #if defined(OPENVINO_CONFIG_MYRIAD)
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider});  // OpenVINO: Disabled temporarily
 #else
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});                          //TensorRT: axis must be 0
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: axis must be 0
 #endif
 }
 
@@ -720,6 +720,14 @@ TEST(ReductionOpTest, ReduceMean_do_not_keepdims) {
                         55.0f, 1.0f,
                         60.0f, 2.0f});
   test.AddOutput<float>("reduced", {3, 2}, {12.5f, 1.5f, 35.0f, 1.5f, 57.5f, 1.5f});
+
+#if defined(__arm__)
+  // armv7 isn't as accurate so need to add a little tolerance for the diffs
+  //  expected[i] evaluates to 35,
+  //  output[i] evaluates to 34.999866485595703
+  test.SetOutputRelErr("reduced", 1e-5f);
+#endif
+
   test.Run();
 }
 
@@ -747,6 +755,14 @@ TEST(ReductionOpTest, ReduceMean_keepdims) {
                         55.0f, 1.0f,
                         60.0f, 2.0f});
   test.AddOutput<float>("reduced", {3, 1, 2}, {12.5f, 1.5f, 35.0f, 1.5f, 57.5f, 1.5f});
+
+#if defined(__arm__)
+  // armv7 isn't as accurate so need to add a little tolerance for the diffs
+  //  expected[i] evaluates to 35,
+  //  output[i] evaluates to 34.999866485595703
+  test.SetOutputRelErr("reduced", 1e-5f);
+#endif
+
   test.Run();
 }
 
@@ -764,6 +780,7 @@ TEST(ReductionOpTest, ReduceMean) {
                         9.0f, 10.0f,
                         11.0f, 12.0f});
   test.AddOutput<float>("reduced", {1, 2, 1}, {5.5f, 7.5f});
+
   test.Run();
 }