From a92ccbe1bc015d0b679b08f0c623d4be164ff7b3 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Fri, 9 Oct 2020 22:34:32 +1000 Subject: [PATCH] Various armv7 related fixes (#5394) * - Link with libatomic if needed - Install pip differently so it doesn't clash with the system pip which may involve a wrapper script - Remove ability to specify offset when Tensor allocates the data. The data prior to offset isn't accessible by anything. - Fix use of offset in TensorOpTest to work on armv7 where it must be aligned to the type it points to. - Fix ActivationOpNoInfTest.Softsign to allow for armv7 behavior - Fix ReductionOpTest.ReduceMean_*keepdims to allow for armv7 floating point inaccuracy * Address PR comments --- cmake/CMakeLists.txt | 20 ++++ cmake/onnxruntime_common.cmake | 8 +- dockerfiles/Dockerfile.arm32v7 | 27 +++-- include/onnxruntime/core/framework/tensor.h | 5 +- onnxruntime/core/framework/sparse_tensor.cc | 6 +- onnxruntime/core/framework/tensor.cc | 7 +- onnxruntime/test/framework/tensor_test.cc | 58 +++++----- .../cpu/activation/activation_op_test.cc | 108 +++++++++++------- .../cpu/reduction/reduction_ops_test.cc | 25 +++- 9 files changed, 163 insertions(+), 101 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 8aee11ffaf..6f25b2f4d2 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -305,6 +305,26 @@ if(onnxruntime_DISABLE_EXCEPTIONS) endif() endif() +# We need to link with libatomic on systems that do not have built-in atomics, or +# don't have built-in support for 8 byte atomics +# Derived from https://github.com/protocolbuffers/protobuf/blob/master/cmake/CMakeLists.txt +set(onnxruntime_LINK_LIBATOMIC false) +if (NOT MSVC) + include(CheckCXXSourceCompiles) + set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS}) + set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} -std=c++11) + check_cxx_source_compiles(" + #include <atomic> + int main() { + return std::atomic<int64_t>{}; + } + " onnxruntime_HAVE_BUILTIN_ATOMICS) + if (NOT 
onnxruntime_HAVE_BUILTIN_ATOMICS) + set(onnxruntime_LINK_LIBATOMIC true) + endif () + set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS}) +endif () + set(REPO_ROOT ${PROJECT_SOURCE_DIR}/..) set(ONNXRUNTIME_ROOT ${PROJECT_SOURCE_DIR}/../onnxruntime) set(ORTTRAINING_ROOT ${PROJECT_SOURCE_DIR}/../orttraining) diff --git a/cmake/onnxruntime_common.cmake b/cmake/onnxruntime_common.cmake index 90b54b4079..d4da8199cc 100644 --- a/cmake/onnxruntime_common.cmake +++ b/cmake/onnxruntime_common.cmake @@ -138,4 +138,10 @@ endif() if (onnxruntime_WINML_NAMESPACE_OVERRIDE STREQUAL "Windows") target_compile_definitions(onnxruntime_common PRIVATE "BUILD_INBOX=1") -endif() \ No newline at end of file +endif() + +# check if we need to link against libatomic due to std::atomic usage by the threadpool code +# e.g. Raspberry Pi requires this +if (onnxruntime_LINK_LIBATOMIC) + list(APPEND onnxruntime_EXTERNAL_LIBRARIES atomic) +endif() diff --git a/dockerfiles/Dockerfile.arm32v7 b/dockerfiles/Dockerfile.arm32v7 index f8cb5902df..bcc7fc55f7 100644 --- a/dockerfiles/Dockerfile.arm32v7 +++ b/dockerfiles/Dockerfile.arm32v7 @@ -1,9 +1,11 @@ +# Import info for 32-bit Qemu based build +# There are also raspberry pi 4 and 64-bit images available so adjust as required FROM balenalib/raspberrypi3-python:latest-stretch-build ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime ARG ONNXRUNTIME_SERVER_BRANCH=master -#Enforces cross-compilation through Quemu +# Enforces cross-compilation through Qemu. 
RUN [ "cross-build-start" ] RUN install_packages \ @@ -14,44 +16,41 @@ RUN install_packages \ libssl-dev \ wget \ python3 \ - python3-pip \ python3-dev \ git \ tar \ libatlas-base-dev -RUN pip3 install --upgrade pip +# Carefully install the latest version of pip +WORKDIR /pip +RUN wget https://bootstrap.pypa.io/get-pip.py +RUN python3 get-pip.py RUN pip3 install --upgrade setuptools RUN pip3 install --upgrade wheel RUN pip3 install numpy # Build the latest cmake WORKDIR /code -RUN wget https://github.com/Kitware/CMake/releases/download/v3.14.3/cmake-3.14.3.tar.gz -RUN tar zxf cmake-3.14.3.tar.gz +RUN wget https://github.com/Kitware/CMake/releases/download/v3.18.3/cmake-3.18.3.tar.gz +RUN tar zxf cmake-3.18.3.tar.gz -WORKDIR /code/cmake-3.14.3 +WORKDIR /code/cmake-3.18.3 RUN ./configure --system-curl RUN make RUN sudo make install # Set up build args ARG BUILDTYPE=MinSizeRel +# if doing a 64-bit build change '--arm' to '--arm64' ARG BUILDARGS="--config ${BUILDTYPE} --arm" # Prepare onnxruntime Repo WORKDIR /code RUN git clone --single-branch --branch ${ONNXRUNTIME_SERVER_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime -# Start the basic build +# Build ORT including the shared lib and python bindings WORKDIR /code/onnxruntime -RUN ./build.sh --use_openmp ${BUILDARGS} --update --build - -# Build Shared Library -RUN ./build.sh --use_openmp ${BUILDARGS} --build_shared_lib - -# Build Python Bindings and Wheel -RUN ./build.sh --use_openmp ${BUILDARGS} --enable_pybind --build_wheel +RUN ./build.sh --use_openmp ${BUILDARGS} --update --build --build_shared_lib --build_wheel # Build Output RUN ls -l /code/onnxruntime/build/Linux/${BUILDTYPE}/*.so diff --git a/include/onnxruntime/core/framework/tensor.h b/include/onnxruntime/core/framework/tensor.h index e6679f32c7..cdf1d24b23 100644 --- a/include/onnxruntime/core/framework/tensor.h +++ b/include/onnxruntime/core/framework/tensor.h @@ -61,11 +61,12 @@ class Tensor final { Tensor() = default; // to allow creating vector to 
support seq(tensor) /** - * Create tensor with given type, shape, pre-allocate memory and allocator info. + * Create tensor with given type, shape, pre-allocated memory and allocator info. * This function won't check if the preallocated buffer(p_data) has enough room for the shape. * \param data A preallocated buffer. Can be NULL if the shape is empty. * Tensor does not own the data and will not delete it * \param alloc Where the buffer('data') was allocated from + * \param offset Offset in bytes to start of Tensor within p_data. */ Tensor(MLDataType p_type, const TensorShape& shape, void* p_data, const OrtMemoryInfo& alloc, ptrdiff_t offset = 0); @@ -74,7 +75,7 @@ class Tensor final { * Deprecated. The orginal design is this Tensor class won't do any allocation / release. * However, this function will allocate the buffer for the shape, and do placement new if p_type is string tensor. */ - Tensor(MLDataType p_type, const TensorShape& shape, std::shared_ptr allocator, ptrdiff_t offset = 0); + Tensor(MLDataType p_type, const TensorShape& shape, std::shared_ptr allocator); ~Tensor(); diff --git a/onnxruntime/core/framework/sparse_tensor.cc b/onnxruntime/core/framework/sparse_tensor.cc index 4542be551d..0d07dc1799 100644 --- a/onnxruntime/core/framework/sparse_tensor.cc +++ b/onnxruntime/core/framework/sparse_tensor.cc @@ -16,7 +16,7 @@ SparseTensor::SparseTensor(MLDataType elt_type, void* values_data, void* indices_data, const OrtMemoryInfo& memory_info) - : values_(elt_type, TensorShape({static_cast(nnz)}), values_data, memory_info, 0), + : values_(elt_type, TensorShape({static_cast(nnz)}), values_data, memory_info), indices_(DataTypeImpl::GetType(), TensorShape({static_cast(nnz), static_cast(shape.NumDimensions())}), indices_data, memory_info, 0), @@ -26,10 +26,10 @@ SparseTensor::SparseTensor(MLDataType elt_type, const TensorShape& shape, size_t nnz, std::shared_ptr allocator) - : values_(elt_type, TensorShape({static_cast(nnz)}), allocator, 0), + : 
values_(elt_type, TensorShape({static_cast(nnz)}), allocator), indices_(DataTypeImpl::GetType(), TensorShape({static_cast(nnz), static_cast(shape.NumDimensions())}), - allocator, 0), + allocator), shape_(shape) {} } // namespace onnxruntime diff --git a/onnxruntime/core/framework/tensor.cc b/onnxruntime/core/framework/tensor.cc index e9923de7d7..5276830169 100644 --- a/onnxruntime/core/framework/tensor.cc +++ b/onnxruntime/core/framework/tensor.cc @@ -17,7 +17,7 @@ Tensor::Tensor(MLDataType p_type, const TensorShape& shape, void* p_data, const Init(p_type, shape, p_data, nullptr, offset); } -Tensor::Tensor(MLDataType p_type, const TensorShape& shape, std::shared_ptr allocator, ptrdiff_t offset) +Tensor::Tensor(MLDataType p_type, const TensorShape& shape, std::shared_ptr allocator) : alloc_info_(allocator->Info()) { ORT_ENFORCE(p_type != nullptr); int64_t shape_size = shape.Size(); // value returned is checked for overflow by TensorShape::Size() @@ -30,13 +30,10 @@ Tensor::Tensor(MLDataType p_type, const TensorShape& shape, std::shared_ptrCalcMemSizeForArray(SafeInt(shape_size), p_type->Size(), &len)) ORT_THROW("tensor failed memory size calculation"); - // TODO: Use case for this isn't clear. We allocate a buffer based on the tensor shape and increase it by offset. - // Who is going to use the memory prior to offset, and/or why should it be allocated here? 
- len += offset; p_data = allocator->Alloc(len); } - Init(p_type, shape, p_data, allocator, offset); + Init(p_type, shape, p_data, allocator); } size_t Tensor::SizeInBytes() const { diff --git a/onnxruntime/test/framework/tensor_test.cc b/onnxruntime/test/framework/tensor_test.cc index d3ca24b88c..6066e6cb8b 100644 --- a/onnxruntime/test/framework/tensor_test.cc +++ b/onnxruntime/test/framework/tensor_test.cc @@ -13,42 +13,43 @@ namespace onnxruntime { namespace test { template -void CPUTensorTest(std::vector dims, const int offset = 0) { - //not own the buffer - TensorShape shape(dims); +void CPUTensorTest(std::vector dims, const int offset_elements = 0) { + // create Tensor where we provide the buffer + TensorShape shape(dims); // this is the shape that will be available starting at the offset in the Tensor auto alloc = TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault); - auto data = alloc->Alloc(sizeof(T) * (shape.Size() + offset)); - EXPECT_TRUE(data); - Tensor t(DataTypeImpl::GetType(), shape, data, alloc->Info(), offset); + // alloc extra data if needed, as anything before the offset is not covered by the shape + auto num_elements = shape.Size() + offset_elements; + auto num_bytes = num_elements * sizeof(T); + auto offset_bytes = offset_elements * sizeof(T); + void* data = alloc->Alloc(num_bytes); + const T* first_element = static_cast(data) + offset_elements; + + Tensor t(DataTypeImpl::GetType(), shape, data, alloc->Info(), offset_bytes); auto tensor_shape = t.Shape(); - //Use reinterpret_cast to bypass a gcc bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51213 - EXPECT_EQ(*reinterpret_cast*>(&shape), *reinterpret_cast*>(&tensor_shape)); + EXPECT_EQ(shape.GetDims(), tensor_shape.GetDims()); EXPECT_EQ(t.DataType(), DataTypeImpl::GetType()); auto& location = t.Location(); EXPECT_STREQ(location.name, CPU); EXPECT_EQ(location.id, 0); - auto t_data = t.template MutableData(); - EXPECT_TRUE(t_data); - memset(t_data, 0, sizeof(T) * shape.Size()); - 
EXPECT_EQ(*(T*)((char*)data + offset), (T)0); + const T* t_data = t.Data(); + EXPECT_EQ(first_element, t_data); alloc->Free(data); - Tensor new_t(DataTypeImpl::GetType(), shape, alloc, offset); + // test when the Tensor allocates the buffer. + // there's no point using an offset_elements here as you'd be allocating extra data prior to the buffer needed + // by the Tensor instance. + if (offset_elements == 0) { + Tensor new_t(DataTypeImpl::GetType(), shape, alloc); + EXPECT_TRUE(new_t.OwnsBuffer()); - tensor_shape = new_t.Shape(); - //Use reinterpret_cast to bypass a gcc bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51213 - EXPECT_EQ(*reinterpret_cast*>(&shape), *reinterpret_cast*>(&tensor_shape)); - EXPECT_EQ(new_t.DataType(), DataTypeImpl::GetType()); - auto& new_location = new_t.Location(); - ASSERT_STREQ(new_location.name, CPU); - EXPECT_EQ(new_location.id, 0); - - auto new_data = new_t.template MutableData(); - EXPECT_TRUE(new_data); - memset(new_data, 0, sizeof(T) * shape.Size()); - EXPECT_EQ(*(T*)((char*)new_data + offset), (T)0); - //no free op as the tensor own the buffer + tensor_shape = new_t.Shape(); + EXPECT_EQ(shape.GetDims(), tensor_shape.GetDims()); + EXPECT_EQ(new_t.DataType(), DataTypeImpl::GetType()); + auto& new_location = new_t.Location(); + ASSERT_STREQ(new_location.name, CPU); + EXPECT_EQ(new_location.id, 0); + } } TEST(TensorTest, CPUFloatTensorTest) { @@ -208,11 +209,6 @@ TEST(TensorTest, SizeOverflow) { Tensor t(type, shape1, nullptr, alloc->Info()); EXPECT_THROW(t.SizeInBytes(), OnnxRuntimeException); - - // overflow due to offset. 
max/4 from shape, *4 from float size, + 4 from offset - TensorShape shape2({static_cast(std::numeric_limits::max() / 4)}); - ptrdiff_t offset = sizeof(float); // one more element to push past max - EXPECT_THROW(Tensor(type, shape2, alloc, offset), OnnxRuntimeException); } } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/activation/activation_op_test.cc b/onnxruntime/test/providers/cpu/activation/activation_op_test.cc index 1921e4335c..1e8a26b18d 100644 --- a/onnxruntime/test/providers/cpu/activation/activation_op_test.cc +++ b/onnxruntime/test/providers/cpu/activation/activation_op_test.cc @@ -9,60 +9,60 @@ namespace test { TEST_F(ActivationOpTest, Sigmoid) { TestActivationOp("Sigmoid", - input_values, - [](float x) { - auto y = 1.f / (1.f + std::exp(-std::abs(x))); // safe sigmoid - y = x > 0 ? y : 1 - y; - return y; - }); + input_values, + [](float x) { + auto y = 1.f / (1.f + std::exp(-std::abs(x))); // safe sigmoid + y = x > 0 ? y : 1 - y; + return y; + }); } TEST_F(ActivationOpTest, HardSigmoid) { float alpha = 0.2f; float beta = 0.5f; TestActivationOp("HardSigmoid", - input_values, - [alpha, beta](float x) { - return std::max(std::min((alpha * x + beta), 1.0f), 0.0f); - }, - {{"alpha", alpha}, {"beta", beta}}); + input_values, + [alpha, beta](float x) { + return std::max(std::min((alpha * x + beta), 1.0f), 0.0f); + }, + {{"alpha", alpha}, {"beta", beta}}); } TEST_F(ActivationOpTest, Tanh) { TestActivationOp("Tanh", - input_values, - [](float x) { return std::tanh(x); }); + input_values, + [](float x) { return std::tanh(x); }); } TEST_F(ActivationOpTest, Relu) { TestActivationOp("Relu", - input_values, - [](float x) { return std::max(x, 0.0f); }); + input_values, + [](float x) { return std::max(x, 0.0f); }); } TEST_F(ActivationOpTest, Elu) { float alpha = 0.1f; TestActivationOp("Elu", - input_values, - [alpha](float x) { return (x >= 0) ? 
x : alpha * (exp(x) - 1); }, - {{"alpha", alpha}}); + input_values, + [alpha](float x) { return (x >= 0) ? x : alpha * (exp(x) - 1); }, + {{"alpha", alpha}}); } TEST_F(ActivationOpTest, LeakyRelu) { float alpha = 0.1f; TestActivationOp("LeakyRelu", - input_values, - [alpha](float x) { return (x >= 0) ? x : alpha * x; }, - {{"alpha", alpha}}); + input_values, + [alpha](float x) { return (x >= 0) ? x : alpha * x; }, + {{"alpha", alpha}}); } TEST_F(ActivationOpTest, ThresholdedRelu) { float alpha = 0.1f; TestActivationOp( - "ThresholdedRelu", - input_values, - [alpha](float x) { return (x >= alpha) ? x : 0; }, - {{"alpha", alpha}}, true, 10); + "ThresholdedRelu", + input_values, + [alpha](float x) { return (x >= alpha) ? x : 0; }, + {{"alpha", alpha}}, true, 10); } TEST_F(ActivationOpTest, Selu) { @@ -70,9 +70,9 @@ TEST_F(ActivationOpTest, Selu) { static constexpr float gamma = 1.0507f; TestActivationOp("Selu", - input_values, - [](float x) { return x <= 0 ? gamma * (alpha * exp(x) - alpha) : gamma * x; }, - {{"alpha", alpha}, {"gamma", gamma}}); + input_values, + [](float x) { return x <= 0 ? gamma * (alpha * exp(x) - alpha) : gamma * x; }, + {{"alpha", alpha}, {"gamma", gamma}}); } TEST_F(ActivationOpTest, Selu_Attributes) { @@ -80,9 +80,9 @@ TEST_F(ActivationOpTest, Selu_Attributes) { static constexpr float gamma = 0.5f; TestActivationOp("Selu", - input_values, - [](float x) { return x <= 0 ? gamma * (alpha * exp(x) - alpha) : gamma * x; }, - {{"alpha", alpha}, {"gamma", gamma}}); + input_values, + [](float x) { return x <= 0 ? 
gamma * (alpha * exp(x) - alpha) : gamma * x; }, + {{"alpha", alpha}, {"gamma", gamma}}); } TEST_F(ActivationOpTest, PRelu) { @@ -145,20 +145,46 @@ TEST_F(ActivationOpTest, PRelu_MultiChannel) { TEST_F(ActivationOpTest, Softplus) { TestActivationOp("Softplus", - input_values, - [](float x) { - if (x > 0) - return x + logf(expf(-x) + 1); - else - return logf(expf(x) + 1); - }); + input_values, + [](float x) { + if (x > 0) + return x + logf(expf(-x) + 1); + else + return logf(expf(x) + 1); + }); } TEST_F(ActivationOpNoInfTest, Softsign) { TestActivationOp( - "Softsign", - input_values, - [](float x) { return x / (1 + std::abs(x)); }, {}, false); // Disable TensorRT because result mismatches + "Softsign", + input_values, + [](float x) { + auto result = x / (1 + std::abs(x)); + +#if defined(__arm__) + // Softsign uses Eigen inverse(), which on ARM32 results in a different value when x is FLT_MAX or -FLT_MAX + // 3.40282347e+38 -> 0 with ARM32 inverse() vs something like 2.939e-39#DEN with other platforms. + // + // Possibly explained by https://en.wikipedia.org/wiki/ARM_architecture#Advanced_SIMD_(Neon) + // 'A quirk of Neon in Armv7 devices is that it flushes all subnormal numbers to zero' + // + // c.f. + // cmake\external\eigen\Eigen\src\Core\arch\SSE\PacketMath.h uses _mm_div_ps for 'pdiv' + // cmake\external\eigen\Eigen\src\Core\arch\NEON\PacketMath.h uses a custom implementation for 'pdiv' + // + // Special case the expected values to allow for that. If handling FLT_MAX more consistently is required + // we'd need to not use Eigen for Softsign on ARM32. 
+ // + if (x == FLT_MAX) { + result = 0.; + } else if (x == -FLT_MAX) { + result = -0.; + } +#endif + + return result; + }, + {}, false); // Disable TensorRT because result mismatches } } // namespace test diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc index cf4985c9b1..602913d300 100644 --- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc +++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc @@ -598,7 +598,7 @@ TEST(ReductionOpTest, ReduceMax_int32) { #if defined(OPENVINO_CONFIG_GPU_FP32) || defined(OPENVINO_CONFIG_GPU_FP16) || defined(OPENVINO_CONFIG_MYRIAD) test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); // OpenVINO: Disabled temporarily #else - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: axis must be 0 + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: axis must be 0 #endif } @@ -619,7 +619,7 @@ TEST(ReductionOpTest, ReduceMax_int64) { #if defined(OPENVINO_CONFIG_GPU_FP32) || defined(OPENVINO_CONFIG_GPU_FP16) || defined(OPENVINO_CONFIG_MYRIAD) test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); // OpenVINO: Disabled temporarily #else - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: axis must be 0 + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: axis must be 0 #endif } @@ -640,7 +640,7 @@ TEST(ReductionOpTest, ReduceMax_int8) { #if defined(OPENVINO_CONFIG_MYRIAD) test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); // OpenVINO: Disabled temporarily #else - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: axis must be 0 + 
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: axis must be 0 #endif } @@ -661,7 +661,7 @@ TEST(ReductionOpTest, ReduceMax_uint8) { #if defined(OPENVINO_CONFIG_MYRIAD) test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); // OpenVINO: Disabled temporarily #else - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: axis must be 0 + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: axis must be 0 #endif } @@ -720,6 +720,14 @@ TEST(ReductionOpTest, ReduceMean_do_not_keepdims) { 55.0f, 1.0f, 60.0f, 2.0f}); test.AddOutput("reduced", {3, 2}, {12.5f, 1.5f, 35.0f, 1.5f, 57.5f, 1.5f}); + +#if defined(__arm__) + // armv7 isn't as accurate so need to add a little tolerance for the diffs + // expected[i] evaluates to 35, + // output[i] evaluates to 34.999866485595703 + test.SetOutputRelErr("reduced", 1e-5f); +#endif + test.Run(); } @@ -747,6 +755,14 @@ TEST(ReductionOpTest, ReduceMean_keepdims) { 55.0f, 1.0f, 60.0f, 2.0f}); test.AddOutput("reduced", {3, 1, 2}, {12.5f, 1.5f, 35.0f, 1.5f, 57.5f, 1.5f}); + +#if defined(__arm__) + // armv7 isn't as accurate so need to add a little tolerance for the diffs + // expected[i] evaluates to 35, + // output[i] evaluates to 34.999866485595703 + test.SetOutputRelErr("reduced", 1e-5f); +#endif + test.Run(); } @@ -764,6 +780,7 @@ TEST(ReductionOpTest, ReduceMean) { 9.0f, 10.0f, 11.0f, 12.0f}); test.AddOutput("reduced", {1, 2, 1}, {5.5f, 7.5f}); + test.Run(); }