Various armv7 related fixes (#5394)

* - Link with libatomic if needed - Install pip differently so it doesn't clash with the system pip which may involve a wrapper script - Remove ability to specify offset when Tensor allocates the data. The data prior to offset isn't accessible by anything. - Fix use of offset in TensorOpTest to work on armv7 where it must be aligned to the type it points to. - Fix ActivationOpNoInfTest.Softsign to allow for armv7 behavior - Fix ReductionOpTest.ReduceMean_*keepdims to allow for armv7 floating point inaccuracy * Address PR comments
2026-07-09 17:28:58 +00:00 · 2020-10-09 22:34:32 +10:00 · 2020-10-09 22:34:32 +10:00 · a92ccbe1bc
commit a92ccbe1bc
parent b99eaa99cd
9 changed files with 163 additions and 101 deletions
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@ -305,6 +305,26 @@ if(onnxruntime_DISABLE_EXCEPTIONS)
  endif()
 endif()

+# Link with libatomic on systems that lack built-in atomics or built-in 8 byte atomics.
+# CMAKE_REQUIRED_FLAGS is a space-delimited string (not a ;-list), so append/restore it quoted.
+# Derived from https://github.com/protocolbuffers/protobuf/blob/master/cmake/CMakeLists.txt
+set(onnxruntime_LINK_LIBATOMIC false)
+if (NOT MSVC)
+  include(CheckCXXSourceCompiles)
+  set(OLD_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")
+  set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -std=c++11")
+  check_cxx_source_compiles("
+    #include <atomic>
+    int main() {
+      return std::atomic<int64_t>{};
+    }
+  " onnxruntime_HAVE_BUILTIN_ATOMICS)
+  if (NOT onnxruntime_HAVE_BUILTIN_ATOMICS)
+    set(onnxruntime_LINK_LIBATOMIC true)
+  endif ()
+  set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
+endif ()
+
 set(REPO_ROOT ${PROJECT_SOURCE_DIR}/..)
 set(ONNXRUNTIME_ROOT ${PROJECT_SOURCE_DIR}/../onnxruntime)
 set(ORTTRAINING_ROOT ${PROJECT_SOURCE_DIR}/../orttraining)
--- a/cmake/onnxruntime_common.cmake
+++ b/cmake/onnxruntime_common.cmake
@ -138,4 +138,10 @@ endif()

 if (onnxruntime_WINML_NAMESPACE_OVERRIDE STREQUAL "Windows")
  target_compile_definitions(onnxruntime_common PRIVATE "BUILD_INBOX=1")
-endif()
+endif()
+
+# check if we need to link against libatomic due to std::atomic usage by the threadpool code
+# e.g. Raspberry Pi requires this
+if (onnxruntime_LINK_LIBATOMIC)
+  list(APPEND onnxruntime_EXTERNAL_LIBRARIES atomic)
+endif()
--- a/dockerfiles/Dockerfile.arm32v7
+++ b/dockerfiles/Dockerfile.arm32v7
@ -1,9 +1,11 @@
+# Base image for the 32-bit Qemu based build.
+# Raspberry Pi 4 and 64-bit images are also available, so adjust as required.
 FROM balenalib/raspberrypi3-python:latest-stretch-build

 ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime
 ARG ONNXRUNTIME_SERVER_BRANCH=master

-#Enforces cross-compilation through Quemu
+# Enforces cross-compilation through Qemu.
 RUN [ "cross-build-start" ]

 RUN install_packages \
@ -14,44 +16,41 @@ RUN install_packages \
    libssl-dev \
    wget \
    python3 \
-    python3-pip \
    python3-dev \
    git \
    tar \
    libatlas-base-dev

-RUN pip3 install --upgrade pip
+# Carefully install the latest version of pip 
+WORKDIR /pip
+RUN wget https://bootstrap.pypa.io/get-pip.py
+RUN python3 get-pip.py
 RUN pip3 install --upgrade setuptools
 RUN pip3 install --upgrade wheel
 RUN pip3 install numpy

 # Build the latest cmake
 WORKDIR /code
-RUN wget https://github.com/Kitware/CMake/releases/download/v3.14.3/cmake-3.14.3.tar.gz
-RUN tar zxf cmake-3.14.3.tar.gz
+RUN wget https://github.com/Kitware/CMake/releases/download/v3.18.3/cmake-3.18.3.tar.gz
+RUN tar zxf cmake-3.18.3.tar.gz 

-WORKDIR /code/cmake-3.14.3
+WORKDIR /code/cmake-3.18.3
 RUN ./configure --system-curl
 RUN make
 RUN sudo make install

 # Set up build args
 ARG BUILDTYPE=MinSizeRel
+# if doing a 64-bit build change '--arm' to '--arm64'
 ARG BUILDARGS="--config ${BUILDTYPE} --arm"

 # Prepare onnxruntime Repo
 WORKDIR /code
 RUN git clone --single-branch --branch ${ONNXRUNTIME_SERVER_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime

-# Start the basic build
+# Build ORT including the shared lib and python bindings
 WORKDIR /code/onnxruntime
-RUN ./build.sh --use_openmp ${BUILDARGS} --update --build
-
-# Build Shared Library
-RUN ./build.sh --use_openmp ${BUILDARGS} --build_shared_lib
-
-# Build Python Bindings and Wheel
-RUN ./build.sh --use_openmp ${BUILDARGS} --enable_pybind --build_wheel
+RUN ./build.sh --use_openmp ${BUILDARGS} --update --build --build_shared_lib --build_wheel

 # Build Output
 RUN ls -l /code/onnxruntime/build/Linux/${BUILDTYPE}/*.so
--- a/include/onnxruntime/core/framework/tensor.h
+++ b/include/onnxruntime/core/framework/tensor.h
@ -61,11 +61,12 @@ class Tensor final {
  Tensor() = default;  // to allow creating vector<Tensor> to support seq(tensor)

  /**
-   * Create tensor with given type, shape, pre-allocate memory and allocator info.
+   * Create tensor with given type, shape, pre-allocated memory and allocator info.
   * This function won't check if the preallocated buffer(p_data) has enough room for the shape.
   * \param data A preallocated buffer. Can be NULL if the shape is empty.
   *              Tensor does not own the data and will not delete it
   * \param alloc Where the buffer('data') was allocated from
+   * \param offset Offset in bytes from the start of p_data to the start of the Tensor's data.
   */
  Tensor(MLDataType p_type, const TensorShape& shape, void* p_data, const OrtMemoryInfo& alloc,
         ptrdiff_t offset = 0);
@ -74,7 +75,7 @@ class Tensor final {
   * Deprecated. The orginal design is this Tensor class won't do any allocation / release.
   * However, this function will allocate the buffer for the shape, and do placement new if p_type is string tensor.
   */
-  Tensor(MLDataType p_type, const TensorShape& shape, std::shared_ptr<IAllocator> allocator, ptrdiff_t offset = 0);
+  Tensor(MLDataType p_type, const TensorShape& shape, std::shared_ptr<IAllocator> allocator);

  ~Tensor();

--- a/onnxruntime/core/framework/sparse_tensor.cc
+++ b/onnxruntime/core/framework/sparse_tensor.cc
@ -16,7 +16,7 @@ SparseTensor::SparseTensor(MLDataType elt_type,
                           void* values_data,
                           void* indices_data,
                           const OrtMemoryInfo& memory_info)
-    : values_(elt_type, TensorShape({static_cast<int64_t>(nnz)}), values_data, memory_info, 0),
+    : values_(elt_type, TensorShape({static_cast<int64_t>(nnz)}), values_data, memory_info),
      indices_(DataTypeImpl::GetType<int64_t>(),
               TensorShape({static_cast<int64_t>(nnz), static_cast<int64_t>(shape.NumDimensions())}),
               indices_data, memory_info, 0),
@ -26,10 +26,10 @@ SparseTensor::SparseTensor(MLDataType elt_type,
                           const TensorShape& shape,
                           size_t nnz,
                           std::shared_ptr<IAllocator> allocator)
-    : values_(elt_type, TensorShape({static_cast<int64_t>(nnz)}), allocator, 0),
+    : values_(elt_type, TensorShape({static_cast<int64_t>(nnz)}), allocator),
      indices_(DataTypeImpl::GetType<int64_t>(),
               TensorShape({static_cast<int64_t>(nnz), static_cast<int64_t>(shape.NumDimensions())}),
-               allocator, 0),
+               allocator),
      shape_(shape) {}

 }  // namespace onnxruntime
--- a/onnxruntime/core/framework/tensor.cc
+++ b/onnxruntime/core/framework/tensor.cc
@ -17,7 +17,7 @@ Tensor::Tensor(MLDataType p_type, const TensorShape& shape, void* p_data, const
  Init(p_type, shape, p_data, nullptr, offset);
 }

-Tensor::Tensor(MLDataType p_type, const TensorShape& shape, std::shared_ptr<IAllocator> allocator, ptrdiff_t offset)
+Tensor::Tensor(MLDataType p_type, const TensorShape& shape, std::shared_ptr<IAllocator> allocator)
    : alloc_info_(allocator->Info()) {
  ORT_ENFORCE(p_type != nullptr);
  int64_t shape_size = shape.Size();  // value returned is checked for overflow by TensorShape::Size()
@ -30,13 +30,10 @@ Tensor::Tensor(MLDataType p_type, const TensorShape& shape, std::shared_ptr<IAll
    if (!allocator->CalcMemSizeForArray(SafeInt<size_t>(shape_size), p_type->Size(), &len))
      ORT_THROW("tensor failed memory size calculation");

-    // TODO: Use case for this isn't clear. We allocate a buffer based on the tensor shape and increase it by offset.
-    // Who is going to use the memory prior to offset, and/or why should it be allocated here?
-    len += offset;
    p_data = allocator->Alloc(len);
  }

-  Init(p_type, shape, p_data, allocator, offset);
+  Init(p_type, shape, p_data, allocator);
 }

 size_t Tensor::SizeInBytes() const {
--- a/onnxruntime/test/framework/tensor_test.cc
+++ b/onnxruntime/test/framework/tensor_test.cc
@ -13,42 +13,43 @@
 namespace onnxruntime {
 namespace test {
 template <typename T>
-void CPUTensorTest(std::vector<int64_t> dims, const int offset = 0) {
-  //not own the buffer
-  TensorShape shape(dims);
+void CPUTensorTest(std::vector<int64_t> dims, const int offset_elements = 0) {
+  // create Tensor where we provide the buffer
+  TensorShape shape(dims);  // this is the shape that will be available starting at the offset in the Tensor
  auto alloc = TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault);
-  auto data = alloc->Alloc(sizeof(T) * (shape.Size() + offset));
-  EXPECT_TRUE(data);
-  Tensor t(DataTypeImpl::GetType<T>(), shape, data, alloc->Info(), offset);
+  // alloc extra data if needed, as anything before the offset is not covered by the shape
+  auto num_elements = shape.Size() + offset_elements;
+  auto num_bytes = num_elements * sizeof(T);
+  auto offset_bytes = offset_elements * sizeof(T);
+  void* data = alloc->Alloc(num_bytes);
+  const T* first_element = static_cast<const T*>(data) + offset_elements;
+
+  Tensor t(DataTypeImpl::GetType<T>(), shape, data, alloc->Info(), offset_bytes);
  auto tensor_shape = t.Shape();
-  //Use reinterpret_cast to bypass a gcc bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51213
-  EXPECT_EQ(*reinterpret_cast<const std::vector<int64_t>*>(&shape), *reinterpret_cast<const std::vector<int64_t>*>(&tensor_shape));
+  EXPECT_EQ(shape.GetDims(), tensor_shape.GetDims());
  EXPECT_EQ(t.DataType(), DataTypeImpl::GetType<T>());
  auto& location = t.Location();
  EXPECT_STREQ(location.name, CPU);
  EXPECT_EQ(location.id, 0);

-  auto t_data = t.template MutableData<T>();
-  EXPECT_TRUE(t_data);
-  memset(t_data, 0, sizeof(T) * shape.Size());
-  EXPECT_EQ(*(T*)((char*)data + offset), (T)0);
+  const T* t_data = t.Data<T>();
+  EXPECT_EQ(first_element, t_data);
  alloc->Free(data);

-  Tensor new_t(DataTypeImpl::GetType<T>(), shape, alloc, offset);
+  // test when the Tensor allocates the buffer.
+  // there's no point using an offset_elements here as you'd be allocating extra data prior to the buffer needed
+  // by the Tensor instance.
+  if (offset_elements == 0) {
+    Tensor new_t(DataTypeImpl::GetType<T>(), shape, alloc);
+    EXPECT_TRUE(new_t.OwnsBuffer());

-  tensor_shape = new_t.Shape();
-  //Use reinterpret_cast to bypass a gcc bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51213
-  EXPECT_EQ(*reinterpret_cast<const std::vector<int64_t>*>(&shape), *reinterpret_cast<const std::vector<int64_t>*>(&tensor_shape));
-  EXPECT_EQ(new_t.DataType(), DataTypeImpl::GetType<T>());
-  auto& new_location = new_t.Location();
-  ASSERT_STREQ(new_location.name, CPU);
-  EXPECT_EQ(new_location.id, 0);
-
-  auto new_data = new_t.template MutableData<T>();
-  EXPECT_TRUE(new_data);
-  memset(new_data, 0, sizeof(T) * shape.Size());
-  EXPECT_EQ(*(T*)((char*)new_data + offset), (T)0);
-  //no free op as the tensor own the buffer
+    tensor_shape = new_t.Shape();
+    EXPECT_EQ(shape.GetDims(), tensor_shape.GetDims());
+    EXPECT_EQ(new_t.DataType(), DataTypeImpl::GetType<T>());
+    auto& new_location = new_t.Location();
+    ASSERT_STREQ(new_location.name, CPU);
+    EXPECT_EQ(new_location.id, 0);
+  }
 }

 TEST(TensorTest, CPUFloatTensorTest) {
@ -208,11 +209,6 @@ TEST(TensorTest, SizeOverflow) {

  Tensor t(type, shape1, nullptr, alloc->Info());
  EXPECT_THROW(t.SizeInBytes(), OnnxRuntimeException);
-
-  // overflow due to offset. max/4 from shape, *4 from float size, + 4 from offset
-  TensorShape shape2({static_cast<int64_t>(std::numeric_limits<size_t>::max() / 4)});
-  ptrdiff_t offset = sizeof(float);  // one more element to push past max
-  EXPECT_THROW(Tensor(type, shape2, alloc, offset), OnnxRuntimeException);
 }
 }  // namespace test
 }  // namespace onnxruntime
--- a/onnxruntime/test/providers/cpu/activation/activation_op_test.cc
+++ b/onnxruntime/test/providers/cpu/activation/activation_op_test.cc
@ -9,60 +9,60 @@ namespace test {

 TEST_F(ActivationOpTest, Sigmoid) {
  TestActivationOp("Sigmoid",
-    input_values,
-    [](float x) {
-      auto y = 1.f / (1.f + std::exp(-std::abs(x)));  // safe sigmoid
-      y = x > 0 ? y : 1 - y;
-      return y;
-    });
+                   input_values,
+                   [](float x) {
+                     auto y = 1.f / (1.f + std::exp(-std::abs(x)));  // safe sigmoid
+                     y = x > 0 ? y : 1 - y;
+                     return y;
+                   });
 }

 TEST_F(ActivationOpTest, HardSigmoid) {
  float alpha = 0.2f;
  float beta = 0.5f;
  TestActivationOp("HardSigmoid",
-    input_values,
-    [alpha, beta](float x) {
-      return std::max(std::min((alpha * x + beta), 1.0f), 0.0f);
-    },
-    {{"alpha", alpha}, {"beta", beta}});
+                   input_values,
+                   [alpha, beta](float x) {
+                     return std::max(std::min((alpha * x + beta), 1.0f), 0.0f);
+                   },
+                   {{"alpha", alpha}, {"beta", beta}});
 }

 TEST_F(ActivationOpTest, Tanh) {
  TestActivationOp("Tanh",
-    input_values,
-    [](float x) { return std::tanh(x); });
+                   input_values,
+                   [](float x) { return std::tanh(x); });
 }

 TEST_F(ActivationOpTest, Relu) {
  TestActivationOp("Relu",
-    input_values,
-    [](float x) { return std::max(x, 0.0f); });
+                   input_values,
+                   [](float x) { return std::max(x, 0.0f); });
 }

 TEST_F(ActivationOpTest, Elu) {
  float alpha = 0.1f;
  TestActivationOp("Elu",
-    input_values,
-    [alpha](float x) { return (x >= 0) ? x : alpha * (exp(x) - 1); },
-    {{"alpha", alpha}});
+                   input_values,
+                   [alpha](float x) { return (x >= 0) ? x : alpha * (exp(x) - 1); },
+                   {{"alpha", alpha}});
 }

 TEST_F(ActivationOpTest, LeakyRelu) {
  float alpha = 0.1f;
  TestActivationOp("LeakyRelu",
-    input_values,
-    [alpha](float x) { return (x >= 0) ? x : alpha * x; },
-    {{"alpha", alpha}});
+                   input_values,
+                   [alpha](float x) { return (x >= 0) ? x : alpha * x; },
+                   {{"alpha", alpha}});
 }

 TEST_F(ActivationOpTest, ThresholdedRelu) {
  float alpha = 0.1f;
  TestActivationOp(
-    "ThresholdedRelu",
-    input_values,
-    [alpha](float x) { return (x >= alpha) ? x : 0; },
-    {{"alpha", alpha}}, true, 10);
+      "ThresholdedRelu",
+      input_values,
+      [alpha](float x) { return (x >= alpha) ? x : 0; },
+      {{"alpha", alpha}}, true, 10);
 }

 TEST_F(ActivationOpTest, Selu) {
@ -70,9 +70,9 @@ TEST_F(ActivationOpTest, Selu) {
  static constexpr float gamma = 1.0507f;

  TestActivationOp("Selu",
-    input_values,
-    [](float x) { return x <= 0 ? gamma * (alpha * exp(x) - alpha) : gamma * x; },
-    {{"alpha", alpha}, {"gamma", gamma}});
+                   input_values,
+                   [](float x) { return x <= 0 ? gamma * (alpha * exp(x) - alpha) : gamma * x; },
+                   {{"alpha", alpha}, {"gamma", gamma}});
 }

 TEST_F(ActivationOpTest, Selu_Attributes) {
@ -80,9 +80,9 @@ TEST_F(ActivationOpTest, Selu_Attributes) {
  static constexpr float gamma = 0.5f;

  TestActivationOp("Selu",
-    input_values,
-    [](float x) { return x <= 0 ? gamma * (alpha * exp(x) - alpha) : gamma * x; },
-    {{"alpha", alpha}, {"gamma", gamma}});
+                   input_values,
+                   [](float x) { return x <= 0 ? gamma * (alpha * exp(x) - alpha) : gamma * x; },
+                   {{"alpha", alpha}, {"gamma", gamma}});
 }

 TEST_F(ActivationOpTest, PRelu) {
@ -145,20 +145,46 @@ TEST_F(ActivationOpTest, PRelu_MultiChannel) {

 TEST_F(ActivationOpTest, Softplus) {
  TestActivationOp("Softplus",
-    input_values,
-    [](float x) {
-      if (x > 0)
-        return x + logf(expf(-x) + 1);
-      else
-        return logf(expf(x) + 1);
-    });
+                   input_values,
+                   [](float x) {
+                     if (x > 0)
+                       return x + logf(expf(-x) + 1);
+                     else
+                       return logf(expf(x) + 1);
+                   });
 }

 TEST_F(ActivationOpNoInfTest, Softsign) {
  TestActivationOp(
-    "Softsign",
-    input_values,
-    [](float x) { return x / (1 + std::abs(x)); }, {}, false);  // Disable TensorRT because result mismatches
+      "Softsign",
+      input_values,
+      [](float x) {
+        auto result = x / (1 + std::abs(x));
+
+#if defined(__arm__)
+        // Softsign uses Eigen inverse(), which on ARM32 results in a different value when x is FLT_MAX or -FLT_MAX
+        // 3.40282347e+38 -> 0 with ARM32 inverse() vs something like 2.939e-39#DEN with other platforms.
+        //
+        // Possibly explained by https://en.wikipedia.org/wiki/ARM_architecture#Advanced_SIMD_(Neon)
+        // 'A quirk of Neon in Armv7 devices is that it flushes all subnormal numbers to zero'
+        //
+        // c.f.
+        // cmake\external\eigen\Eigen\src\Core\arch\SSE\PacketMath.h uses _mm_div_ps for 'pdiv<Packet4f>'
+        // cmake\external\eigen\Eigen\src\Core\arch\NEON\PacketMath.h uses a custom implementation for 'pdiv<Packet4f>'
+        //
+        // Special case the expected values to allow for that. If handling FLT_MAX more consistently is required
+        // we'd need to not use Eigen for Softsign on ARM32.
+        //
+        if (x == FLT_MAX) {
+          result = 0.;
+        } else if (x == -FLT_MAX) {
+          result = -0.;
+        }
+#endif
+
+        return result;
+      },
+      {}, false);  // Disable TensorRT because result mismatches
 }

 }  // namespace test
--- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
+++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
@ -598,7 +598,7 @@ TEST(ReductionOpTest, ReduceMax_int32) {
 #if defined(OPENVINO_CONFIG_GPU_FP32) || defined(OPENVINO_CONFIG_GPU_FP16) || defined(OPENVINO_CONFIG_MYRIAD)
  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider});  // OpenVINO: Disabled temporarily
 #else
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});                          //TensorRT: axis must be 0
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: axis must be 0
 #endif
 }

@ -619,7 +619,7 @@ TEST(ReductionOpTest, ReduceMax_int64) {
 #if defined(OPENVINO_CONFIG_GPU_FP32) || defined(OPENVINO_CONFIG_GPU_FP16) || defined(OPENVINO_CONFIG_MYRIAD)
  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider});  // OpenVINO: Disabled temporarily
 #else
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});                          //TensorRT: axis must be 0
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: axis must be 0
 #endif
 }

@ -640,7 +640,7 @@ TEST(ReductionOpTest, ReduceMax_int8) {
 #if defined(OPENVINO_CONFIG_MYRIAD)
  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider});  // OpenVINO: Disabled temporarily
 #else
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});                          //TensorRT: axis must be 0
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: axis must be 0
 #endif
 }

@ -661,7 +661,7 @@ TEST(ReductionOpTest, ReduceMax_uint8) {
 #if defined(OPENVINO_CONFIG_MYRIAD)
  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider});  // OpenVINO: Disabled temporarily
 #else
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});                          //TensorRT: axis must be 0
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: axis must be 0
 #endif
 }

@ -720,6 +720,14 @@ TEST(ReductionOpTest, ReduceMean_do_not_keepdims) {
                        55.0f, 1.0f,
                        60.0f, 2.0f});
  test.AddOutput<float>("reduced", {3, 2}, {12.5f, 1.5f, 35.0f, 1.5f, 57.5f, 1.5f});
+
+#if defined(__arm__)
+  // armv7 isn't as accurate so need to add a little tolerance for the diffs
+  //  expected[i] evaluates to 35,
+  //  output[i] evaluates to 34.999866485595703
+  test.SetOutputRelErr("reduced", 1e-5f);
+#endif
+
  test.Run();
 }

@ -747,6 +755,14 @@ TEST(ReductionOpTest, ReduceMean_keepdims) {
                        55.0f, 1.0f,
                        60.0f, 2.0f});
  test.AddOutput<float>("reduced", {3, 1, 2}, {12.5f, 1.5f, 35.0f, 1.5f, 57.5f, 1.5f});
+
+#if defined(__arm__)
+  // armv7 isn't as accurate so need to add a little tolerance for the diffs
+  //  expected[i] evaluates to 35,
+  //  output[i] evaluates to 34.999866485595703
+  test.SetOutputRelErr("reduced", 1e-5f);
+#endif
+
  test.Run();
 }

@ -764,6 +780,7 @@ TEST(ReductionOpTest, ReduceMean) {
                        9.0f, 10.0f,
                        11.0f, 12.0f});
  test.AddOutput<float>("reduced", {1, 2, 1}, {5.5f, 7.5f});
+
  test.Run();
 }