Various armv7 related fixes (#5394)

* - Link with libatomic if needed
 - Install pip differently so it doesn't clash with the system pip which may involve a wrapper script
 - Remove ability to specify offset when Tensor allocates the data. The data prior to offset isn't accessible by anything.
 - Fix use of offset in TensorOpTest to work on armv7 where it must be aligned to the type it points to.
 - Fix ActivationOpNoInfTest.Softsign to allow for armv7 behavior
 - Fix ReductionOpTest.ReduceMean_*keepdims to allow for armv7 floating point inaccuracy

* Address PR comments
This commit is contained in:
Scott McKay 2020-10-09 22:34:32 +10:00 committed by GitHub
parent b99eaa99cd
commit a92ccbe1bc
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 163 additions and 101 deletions

View file

@ -305,6 +305,26 @@ if(onnxruntime_DISABLE_EXCEPTIONS)
endif()
endif()
# Decide whether libatomic has to be linked explicitly.
# We need to link with libatomic on systems that do not have built-in atomics, or
# don't have built-in support for 8 byte atomics.
# Derived from https://github.com/protocolbuffers/protobuf/blob/master/cmake/CMakeLists.txt
#
# Result: onnxruntime_LINK_LIBATOMIC is set to true when a small program using
# std::atomic<std::int64_t> cannot be built without extra libraries; otherwise it stays false.
# The probe is skipped entirely for MSVC.
set(onnxruntime_LINK_LIBATOMIC false)
if (NOT MSVC)
  include(CheckCXXSourceCompiles)
  # CMAKE_REQUIRED_FLAGS is a space-delimited string, not a list. Keep every expansion quoted so
  # pre-existing flags are not joined to the new one with ';' and are restored unchanged.
  set(OLD_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")
  set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -std=c++11")
  # <cstdint> is included explicitly so the probe cannot fail merely because <atomic> does not
  # declare int64_t transitively, which would wrongly report that libatomic is required.
  check_cxx_source_compiles("
    #include <atomic>
    #include <cstdint>
    int main() {
      return std::atomic<std::int64_t>{};
    }
  " onnxruntime_HAVE_BUILTIN_ATOMICS)
  if (NOT onnxruntime_HAVE_BUILTIN_ATOMICS)
    set(onnxruntime_LINK_LIBATOMIC true)
  endif ()
  # Put the caller's flags back so later checks are unaffected by the -std=c++11 addition.
  set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
endif ()
set(REPO_ROOT ${PROJECT_SOURCE_DIR}/..)
set(ONNXRUNTIME_ROOT ${PROJECT_SOURCE_DIR}/../onnxruntime)
set(ORTTRAINING_ROOT ${PROJECT_SOURCE_DIR}/../orttraining)

View file

@ -138,4 +138,10 @@ endif()
if (onnxruntime_WINML_NAMESPACE_OVERRIDE STREQUAL "Windows")
target_compile_definitions(onnxruntime_common PRIVATE "BUILD_INBOX=1")
endif()
endif()
# The threadpool code uses std::atomic, which on some platforms (Raspberry Pi for example)
# only links when libatomic is added. onnxruntime_LINK_LIBATOMIC says whether that is needed.
if(onnxruntime_LINK_LIBATOMIC)
  list(APPEND onnxruntime_EXTERNAL_LIBRARIES "atomic")
endif()

View file

@ -1,9 +1,11 @@
# Import info for 32-bit Qemu based build
# There are also raspberry pi 4 and 64-bit images available so adjust as required
FROM balenalib/raspberrypi3-python:latest-stretch-build
ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime
ARG ONNXRUNTIME_SERVER_BRANCH=master
#Enforces cross-compilation through Quemu
# Enforces cross-compilation through Qemu.
RUN [ "cross-build-start" ]
RUN install_packages \
@ -14,44 +16,41 @@ RUN install_packages \
libssl-dev \
wget \
python3 \
python3-pip \
python3-dev \
git \
tar \
libatlas-base-dev
RUN pip3 install --upgrade pip
# Carefully install the latest version of pip
WORKDIR /pip
RUN wget https://bootstrap.pypa.io/get-pip.py
RUN python3 get-pip.py
RUN pip3 install --upgrade setuptools
RUN pip3 install --upgrade wheel
RUN pip3 install numpy
# Build the latest cmake
WORKDIR /code
RUN wget https://github.com/Kitware/CMake/releases/download/v3.14.3/cmake-3.14.3.tar.gz
RUN tar zxf cmake-3.14.3.tar.gz
RUN wget https://github.com/Kitware/CMake/releases/download/v3.18.3/cmake-3.18.3.tar.gz
RUN tar zxf cmake-3.18.3.tar.gz
WORKDIR /code/cmake-3.14.3
WORKDIR /code/cmake-3.18.3
RUN ./configure --system-curl
RUN make
RUN sudo make install
# Set up build args
ARG BUILDTYPE=MinSizeRel
# if doing a 64-bit build change '--arm' to '--arm64'
ARG BUILDARGS="--config ${BUILDTYPE} --arm"
# Prepare onnxruntime Repo
WORKDIR /code
RUN git clone --single-branch --branch ${ONNXRUNTIME_SERVER_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime
# Start the basic build
# Build ORT including the shared lib and python bindings
WORKDIR /code/onnxruntime
RUN ./build.sh --use_openmp ${BUILDARGS} --update --build
# Build Shared Library
RUN ./build.sh --use_openmp ${BUILDARGS} --build_shared_lib
# Build Python Bindings and Wheel
RUN ./build.sh --use_openmp ${BUILDARGS} --enable_pybind --build_wheel
RUN ./build.sh --use_openmp ${BUILDARGS} --update --build --build_shared_lib --build_wheel
# Build Output
RUN ls -l /code/onnxruntime/build/Linux/${BUILDTYPE}/*.so

View file

@ -61,11 +61,12 @@ class Tensor final {
Tensor() = default; // to allow creating vector<Tensor> to support seq(tensor)
/**
* Create tensor with given type, shape, pre-allocate memory and allocator info.
* Create tensor with given type, shape, pre-allocated memory and allocator info.
* This function won't check if the preallocated buffer(p_data) has enough room for the shape.
* \param p_data A preallocated buffer. Can be NULL if the shape is empty.
* Tensor does not own the data and will not delete it
* \param alloc Where the buffer('p_data') was allocated from
* \param offset Offset in bytes to start of Tensor within p_data.
*/
Tensor(MLDataType p_type, const TensorShape& shape, void* p_data, const OrtMemoryInfo& alloc,
ptrdiff_t offset = 0);
@ -74,7 +75,7 @@ class Tensor final {
* Deprecated. The original design is that this Tensor class won't do any allocation / release.
* However, this function will allocate the buffer for the shape, and do placement new if p_type is string tensor.
*/
Tensor(MLDataType p_type, const TensorShape& shape, std::shared_ptr<IAllocator> allocator, ptrdiff_t offset = 0);
Tensor(MLDataType p_type, const TensorShape& shape, std::shared_ptr<IAllocator> allocator);
~Tensor();

View file

@ -16,7 +16,7 @@ SparseTensor::SparseTensor(MLDataType elt_type,
void* values_data,
void* indices_data,
const OrtMemoryInfo& memory_info)
: values_(elt_type, TensorShape({static_cast<int64_t>(nnz)}), values_data, memory_info, 0),
: values_(elt_type, TensorShape({static_cast<int64_t>(nnz)}), values_data, memory_info),
indices_(DataTypeImpl::GetType<int64_t>(),
TensorShape({static_cast<int64_t>(nnz), static_cast<int64_t>(shape.NumDimensions())}),
indices_data, memory_info, 0),
@ -26,10 +26,10 @@ SparseTensor::SparseTensor(MLDataType elt_type,
const TensorShape& shape,
size_t nnz,
std::shared_ptr<IAllocator> allocator)
: values_(elt_type, TensorShape({static_cast<int64_t>(nnz)}), allocator, 0),
: values_(elt_type, TensorShape({static_cast<int64_t>(nnz)}), allocator),
indices_(DataTypeImpl::GetType<int64_t>(),
TensorShape({static_cast<int64_t>(nnz), static_cast<int64_t>(shape.NumDimensions())}),
allocator, 0),
allocator),
shape_(shape) {}
} // namespace onnxruntime

View file

@ -17,7 +17,7 @@ Tensor::Tensor(MLDataType p_type, const TensorShape& shape, void* p_data, const
Init(p_type, shape, p_data, nullptr, offset);
}
Tensor::Tensor(MLDataType p_type, const TensorShape& shape, std::shared_ptr<IAllocator> allocator, ptrdiff_t offset)
Tensor::Tensor(MLDataType p_type, const TensorShape& shape, std::shared_ptr<IAllocator> allocator)
: alloc_info_(allocator->Info()) {
ORT_ENFORCE(p_type != nullptr);
int64_t shape_size = shape.Size(); // value returned is checked for overflow by TensorShape::Size()
@ -30,13 +30,10 @@ Tensor::Tensor(MLDataType p_type, const TensorShape& shape, std::shared_ptr<IAll
if (!allocator->CalcMemSizeForArray(SafeInt<size_t>(shape_size), p_type->Size(), &len))
ORT_THROW("tensor failed memory size calculation");
// TODO: Use case for this isn't clear. We allocate a buffer based on the tensor shape and increase it by offset.
// Who is going to use the memory prior to offset, and/or why should it be allocated here?
len += offset;
p_data = allocator->Alloc(len);
}
Init(p_type, shape, p_data, allocator, offset);
Init(p_type, shape, p_data, allocator);
}
size_t Tensor::SizeInBytes() const {

View file

@ -13,42 +13,43 @@
namespace onnxruntime {
namespace test {
template <typename T>
void CPUTensorTest(std::vector<int64_t> dims, const int offset = 0) {
//not own the buffer
TensorShape shape(dims);
void CPUTensorTest(std::vector<int64_t> dims, const int offset_elements = 0) {
// create Tensor where we provide the buffer
TensorShape shape(dims); // this is the shape that will be available starting at the offset in the Tensor
auto alloc = TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault);
auto data = alloc->Alloc(sizeof(T) * (shape.Size() + offset));
EXPECT_TRUE(data);
Tensor t(DataTypeImpl::GetType<T>(), shape, data, alloc->Info(), offset);
// alloc extra data if needed, as anything before the offset is not covered by the shape
auto num_elements = shape.Size() + offset_elements;
auto num_bytes = num_elements * sizeof(T);
auto offset_bytes = offset_elements * sizeof(T);
void* data = alloc->Alloc(num_bytes);
const T* first_element = static_cast<const T*>(data) + offset_elements;
Tensor t(DataTypeImpl::GetType<T>(), shape, data, alloc->Info(), offset_bytes);
auto tensor_shape = t.Shape();
//Use reinterpret_cast to bypass a gcc bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51213
EXPECT_EQ(*reinterpret_cast<const std::vector<int64_t>*>(&shape), *reinterpret_cast<const std::vector<int64_t>*>(&tensor_shape));
EXPECT_EQ(shape.GetDims(), tensor_shape.GetDims());
EXPECT_EQ(t.DataType(), DataTypeImpl::GetType<T>());
auto& location = t.Location();
EXPECT_STREQ(location.name, CPU);
EXPECT_EQ(location.id, 0);
auto t_data = t.template MutableData<T>();
EXPECT_TRUE(t_data);
memset(t_data, 0, sizeof(T) * shape.Size());
EXPECT_EQ(*(T*)((char*)data + offset), (T)0);
const T* t_data = t.Data<T>();
EXPECT_EQ(first_element, t_data);
alloc->Free(data);
Tensor new_t(DataTypeImpl::GetType<T>(), shape, alloc, offset);
// test when the Tensor allocates the buffer.
// there's no point using an offset_elements here as you'd be allocating extra data prior to the buffer needed
// by the Tensor instance.
if (offset_elements == 0) {
Tensor new_t(DataTypeImpl::GetType<T>(), shape, alloc);
EXPECT_TRUE(new_t.OwnsBuffer());
tensor_shape = new_t.Shape();
//Use reinterpret_cast to bypass a gcc bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51213
EXPECT_EQ(*reinterpret_cast<const std::vector<int64_t>*>(&shape), *reinterpret_cast<const std::vector<int64_t>*>(&tensor_shape));
EXPECT_EQ(new_t.DataType(), DataTypeImpl::GetType<T>());
auto& new_location = new_t.Location();
ASSERT_STREQ(new_location.name, CPU);
EXPECT_EQ(new_location.id, 0);
auto new_data = new_t.template MutableData<T>();
EXPECT_TRUE(new_data);
memset(new_data, 0, sizeof(T) * shape.Size());
EXPECT_EQ(*(T*)((char*)new_data + offset), (T)0);
//no free op as the tensor own the buffer
tensor_shape = new_t.Shape();
EXPECT_EQ(shape.GetDims(), tensor_shape.GetDims());
EXPECT_EQ(new_t.DataType(), DataTypeImpl::GetType<T>());
auto& new_location = new_t.Location();
ASSERT_STREQ(new_location.name, CPU);
EXPECT_EQ(new_location.id, 0);
}
}
TEST(TensorTest, CPUFloatTensorTest) {
@ -208,11 +209,6 @@ TEST(TensorTest, SizeOverflow) {
Tensor t(type, shape1, nullptr, alloc->Info());
EXPECT_THROW(t.SizeInBytes(), OnnxRuntimeException);
// overflow due to offset. max/4 from shape, *4 from float size, + 4 from offset
TensorShape shape2({static_cast<int64_t>(std::numeric_limits<size_t>::max() / 4)});
ptrdiff_t offset = sizeof(float); // one more element to push past max
EXPECT_THROW(Tensor(type, shape2, alloc, offset), OnnxRuntimeException);
}
} // namespace test
} // namespace onnxruntime

View file

@ -9,60 +9,60 @@ namespace test {
TEST_F(ActivationOpTest, Sigmoid) {
TestActivationOp("Sigmoid",
input_values,
[](float x) {
auto y = 1.f / (1.f + std::exp(-std::abs(x))); // safe sigmoid
y = x > 0 ? y : 1 - y;
return y;
});
input_values,
[](float x) {
auto y = 1.f / (1.f + std::exp(-std::abs(x))); // safe sigmoid
y = x > 0 ? y : 1 - y;
return y;
});
}
TEST_F(ActivationOpTest, HardSigmoid) {
float alpha = 0.2f;
float beta = 0.5f;
TestActivationOp("HardSigmoid",
input_values,
[alpha, beta](float x) {
return std::max(std::min((alpha * x + beta), 1.0f), 0.0f);
},
{{"alpha", alpha}, {"beta", beta}});
input_values,
[alpha, beta](float x) {
return std::max(std::min((alpha * x + beta), 1.0f), 0.0f);
},
{{"alpha", alpha}, {"beta", beta}});
}
TEST_F(ActivationOpTest, Tanh) {
TestActivationOp("Tanh",
input_values,
[](float x) { return std::tanh(x); });
input_values,
[](float x) { return std::tanh(x); });
}
TEST_F(ActivationOpTest, Relu) {
TestActivationOp("Relu",
input_values,
[](float x) { return std::max(x, 0.0f); });
input_values,
[](float x) { return std::max(x, 0.0f); });
}
TEST_F(ActivationOpTest, Elu) {
float alpha = 0.1f;
TestActivationOp("Elu",
input_values,
[alpha](float x) { return (x >= 0) ? x : alpha * (exp(x) - 1); },
{{"alpha", alpha}});
input_values,
[alpha](float x) { return (x >= 0) ? x : alpha * (exp(x) - 1); },
{{"alpha", alpha}});
}
TEST_F(ActivationOpTest, LeakyRelu) {
float alpha = 0.1f;
TestActivationOp("LeakyRelu",
input_values,
[alpha](float x) { return (x >= 0) ? x : alpha * x; },
{{"alpha", alpha}});
input_values,
[alpha](float x) { return (x >= 0) ? x : alpha * x; },
{{"alpha", alpha}});
}
TEST_F(ActivationOpTest, ThresholdedRelu) {
float alpha = 0.1f;
TestActivationOp(
"ThresholdedRelu",
input_values,
[alpha](float x) { return (x >= alpha) ? x : 0; },
{{"alpha", alpha}}, true, 10);
"ThresholdedRelu",
input_values,
[alpha](float x) { return (x >= alpha) ? x : 0; },
{{"alpha", alpha}}, true, 10);
}
TEST_F(ActivationOpTest, Selu) {
@ -70,9 +70,9 @@ TEST_F(ActivationOpTest, Selu) {
static constexpr float gamma = 1.0507f;
TestActivationOp("Selu",
input_values,
[](float x) { return x <= 0 ? gamma * (alpha * exp(x) - alpha) : gamma * x; },
{{"alpha", alpha}, {"gamma", gamma}});
input_values,
[](float x) { return x <= 0 ? gamma * (alpha * exp(x) - alpha) : gamma * x; },
{{"alpha", alpha}, {"gamma", gamma}});
}
TEST_F(ActivationOpTest, Selu_Attributes) {
@ -80,9 +80,9 @@ TEST_F(ActivationOpTest, Selu_Attributes) {
static constexpr float gamma = 0.5f;
TestActivationOp("Selu",
input_values,
[](float x) { return x <= 0 ? gamma * (alpha * exp(x) - alpha) : gamma * x; },
{{"alpha", alpha}, {"gamma", gamma}});
input_values,
[](float x) { return x <= 0 ? gamma * (alpha * exp(x) - alpha) : gamma * x; },
{{"alpha", alpha}, {"gamma", gamma}});
}
TEST_F(ActivationOpTest, PRelu) {
@ -145,20 +145,46 @@ TEST_F(ActivationOpTest, PRelu_MultiChannel) {
TEST_F(ActivationOpTest, Softplus) {
TestActivationOp("Softplus",
input_values,
[](float x) {
if (x > 0)
return x + logf(expf(-x) + 1);
else
return logf(expf(x) + 1);
});
input_values,
[](float x) {
if (x > 0)
return x + logf(expf(-x) + 1);
else
return logf(expf(x) + 1);
});
}
TEST_F(ActivationOpNoInfTest, Softsign) {
TestActivationOp(
"Softsign",
input_values,
[](float x) { return x / (1 + std::abs(x)); }, {}, false); // Disable TensorRT because result mismatches
"Softsign",
input_values,
[](float x) {
auto result = x / (1 + std::abs(x));
#if defined(__arm__)
// Softsign uses Eigen inverse(), which on ARM32 results in a different value when x is FLT_MAX or -FLT_MAX
// 3.40282347e+38 -> 0 with ARM32 inverse() vs something like 2.939e-39#DEN with other platforms.
//
// Possibly explained by https://en.wikipedia.org/wiki/ARM_architecture#Advanced_SIMD_(Neon)
// 'A quirk of Neon in Armv7 devices is that it flushes all subnormal numbers to zero'
//
// c.f.
// cmake\external\eigen\Eigen\src\Core\arch\SSE\PacketMath.h uses _mm_div_ps for 'pdiv<Packet4f>'
// cmake\external\eigen\Eigen\src\Core\arch\NEON\PacketMath.h uses a custom implementation for 'pdiv<Packet4f>'
//
// Special case the expected values to allow for that. If handling FLT_MAX more consistently is required
// we'd need to not use Eigen for Softsign on ARM32.
//
if (x == FLT_MAX) {
result = 0.;
} else if (x == -FLT_MAX) {
result = -0.;
}
#endif
return result;
},
{}, false); // Disable TensorRT because result mismatches
}
} // namespace test

View file

@ -598,7 +598,7 @@ TEST(ReductionOpTest, ReduceMax_int32) {
#if defined(OPENVINO_CONFIG_GPU_FP32) || defined(OPENVINO_CONFIG_GPU_FP16) || defined(OPENVINO_CONFIG_MYRIAD)
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); // OpenVINO: Disabled temporarily
#else
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: axis must be 0
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: axis must be 0
#endif
}
@ -619,7 +619,7 @@ TEST(ReductionOpTest, ReduceMax_int64) {
#if defined(OPENVINO_CONFIG_GPU_FP32) || defined(OPENVINO_CONFIG_GPU_FP16) || defined(OPENVINO_CONFIG_MYRIAD)
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); // OpenVINO: Disabled temporarily
#else
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: axis must be 0
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: axis must be 0
#endif
}
@ -640,7 +640,7 @@ TEST(ReductionOpTest, ReduceMax_int8) {
#if defined(OPENVINO_CONFIG_MYRIAD)
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); // OpenVINO: Disabled temporarily
#else
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: axis must be 0
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: axis must be 0
#endif
}
@ -661,7 +661,7 @@ TEST(ReductionOpTest, ReduceMax_uint8) {
#if defined(OPENVINO_CONFIG_MYRIAD)
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); // OpenVINO: Disabled temporarily
#else
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: axis must be 0
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: axis must be 0
#endif
}
@ -720,6 +720,14 @@ TEST(ReductionOpTest, ReduceMean_do_not_keepdims) {
55.0f, 1.0f,
60.0f, 2.0f});
test.AddOutput<float>("reduced", {3, 2}, {12.5f, 1.5f, 35.0f, 1.5f, 57.5f, 1.5f});
#if defined(__arm__)
// armv7 isn't as accurate so need to add a little tolerance for the diffs
// expected[i] evaluates to 35,
// output[i] evaluates to 34.999866485595703
test.SetOutputRelErr("reduced", 1e-5f);
#endif
test.Run();
}
@ -747,6 +755,14 @@ TEST(ReductionOpTest, ReduceMean_keepdims) {
55.0f, 1.0f,
60.0f, 2.0f});
test.AddOutput<float>("reduced", {3, 1, 2}, {12.5f, 1.5f, 35.0f, 1.5f, 57.5f, 1.5f});
#if defined(__arm__)
// armv7 isn't as accurate so need to add a little tolerance for the diffs
// expected[i] evaluates to 35,
// output[i] evaluates to 34.999866485595703
test.SetOutputRelErr("reduced", 1e-5f);
#endif
test.Run();
}
@ -764,6 +780,7 @@ TEST(ReductionOpTest, ReduceMean) {
9.0f, 10.0f,
11.0f, 12.0f});
test.AddOutput<float>("reduced", {1, 2, 1}, {5.5f, 7.5f});
test.Run();
}