mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-18 21:21:17 +00:00
Enable Arm Compute Library 23.08 (#17672)
### Description This PR enables onnxruntime to build with the most recent release of Arm Compute Library. ### Motivation and Context The latest version of Arm Compute Library that onnxruntime can build with is 20.02, which is more than 3 years old.
This commit is contained in:
parent
a2afd92093
commit
37ac9d391c
9 changed files with 113 additions and 20 deletions
|
|
@ -131,6 +131,7 @@ option(onnxruntime_USE_ACL_1902 "Build with ACL version 1902 support" OFF)
|
|||
option(onnxruntime_USE_ACL_1905 "Build with ACL version 1905 support" OFF)
|
||||
option(onnxruntime_USE_ACL_1908 "Build with ACL version 1908 support" OFF)
|
||||
option(onnxruntime_USE_ACL_2002 "Build with ACL version 2002 support" OFF)
|
||||
option(onnxruntime_USE_ACL_2308 "Build with ACL version 2308 support" OFF)
|
||||
option(onnxruntime_USE_ARMNN "Build with ArmNN support" OFF)
|
||||
option(onnxruntime_ARMNN_RELU_USE_CPU "Use the CPU implementation for the Relu operator for the ArmNN EP" ON)
|
||||
option(onnxruntime_ARMNN_BN_USE_CPU "Use the CPU implementation for the Batch Normalization operator for the ArmNN EP" ON)
|
||||
|
|
@ -1110,7 +1111,7 @@ function(onnxruntime_add_include_to_target dst_target)
|
|||
endfunction()
|
||||
|
||||
# ACL
|
||||
if (onnxruntime_USE_ACL OR onnxruntime_USE_ACL_1902 OR onnxruntime_USE_ACL_1905 OR onnxruntime_USE_ACL_1908 OR onnxruntime_USE_ACL_2002)
|
||||
if (onnxruntime_USE_ACL OR onnxruntime_USE_ACL_1902 OR onnxruntime_USE_ACL_1905 OR onnxruntime_USE_ACL_1908 OR onnxruntime_USE_ACL_2002 OR onnxruntime_USE_ACL_2308)
|
||||
set(onnxruntime_USE_ACL ON)
|
||||
if (onnxruntime_USE_ACL_1902)
|
||||
add_definitions(-DACL_1902=1)
|
||||
|
|
@ -1121,7 +1122,11 @@ if (onnxruntime_USE_ACL OR onnxruntime_USE_ACL_1902 OR onnxruntime_USE_ACL_1905
|
|||
if (onnxruntime_USE_ACL_2002)
|
||||
add_definitions(-DACL_2002=1)
|
||||
else()
|
||||
add_definitions(-DACL_1905=1)
|
||||
if (onnxruntime_USE_ACL_2308)
|
||||
add_definitions(-DACL_2308=1)
|
||||
else()
|
||||
add_definitions(-DACL_1905=1)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
|
|
|||
|
|
@ -49,11 +49,18 @@ class Gemm : public onnxruntime::Gemm<T> {
|
|||
}
|
||||
|
||||
Status Compute(OpKernelContext* context) const override {
|
||||
#ifdef ACL_2308
|
||||
if (this->packed_b_) {
|
||||
// Prepacked RHS not supported, defaulting to cpu execution provider
|
||||
return onnxruntime::Gemm<T>::Compute(context);
|
||||
}
|
||||
#endif
|
||||
const auto A = context->Input<Tensor>(0);
|
||||
const auto B = context->Input<Tensor>(1);
|
||||
const auto C = context->Input<Tensor>(2);
|
||||
|
||||
GemmHelper helper(A->Shape(), trans_A_ != CblasNoTrans, B->Shape(), trans_B_ != CblasNoTrans, C->Shape());
|
||||
GemmHelper helper(A->Shape(), trans_A_ != CblasNoTrans, B->Shape(), trans_B_ != CblasNoTrans,
|
||||
C != nullptr ? C->Shape() : TensorShape({}));
|
||||
|
||||
if (!helper.State().IsOK())
|
||||
return helper.State();
|
||||
|
|
@ -70,7 +77,7 @@ class Gemm : public onnxruntime::Gemm<T> {
|
|||
return onnxruntime::Gemm<T>::Compute(context);
|
||||
}
|
||||
|
||||
arm_compute::TensorShape cShape = ACLTensorShape(C->Shape());
|
||||
arm_compute::TensorShape cShape = ACLTensorShape(C != nullptr ? C->Shape() : TensorShape({}));
|
||||
if (useC &&
|
||||
(cShape.num_dimensions() > 2 ||
|
||||
(cShape.num_dimensions() == 2 && cShape[0] > 1 && cShape[1] > 1))) { // Multi-dimensional Bias
|
||||
|
|
@ -89,8 +96,13 @@ class Gemm : public onnxruntime::Gemm<T> {
|
|||
(cShape[1] == 1 && cShape[0] != (long unsigned int)N)) {
|
||||
return onnxruntime::Gemm<T>::Compute(context);
|
||||
}
|
||||
#ifdef ACL_2308
|
||||
cShape = arm_compute::TensorShape(N);
|
||||
LOGS_DEFAULT(VERBOSE) << "Bias reshaped to: {" << N << "}";
|
||||
#else
|
||||
cShape = arm_compute::TensorShape(1, N);
|
||||
LOGS_DEFAULT(VERBOSE) << "Bias reshaped to: {1," << N << "}";
|
||||
#endif
|
||||
}
|
||||
|
||||
int64_t K = helper.K();
|
||||
|
|
|
|||
|
|
@ -44,6 +44,16 @@ Status BatchNorm<T>::Compute(OpKernelContext* context) const {
|
|||
const Tensor* M = context->Input<Tensor>(3); // mean
|
||||
const Tensor* V = context->Input<Tensor>(4); // var
|
||||
|
||||
if (S->Shape().NumDimensions() > 1) {
|
||||
LOGS_DEFAULT(WARNING) << "ACL does not support scale with dimension greater then 1; defaulting to cpu implementation";
|
||||
return onnxruntime::BatchNorm<T>::Compute(context);
|
||||
}
|
||||
|
||||
if (this->is_train_) {
|
||||
LOGS_DEFAULT(WARNING) << "ACL does not have batchnorm training support; defaulting to cpu implementation";
|
||||
return onnxruntime::BatchNorm<T>::Compute(context);
|
||||
}
|
||||
|
||||
ORT_RETURN_IF_ERROR(BatchNormHelper::ValidateInputs(X, S, B, M, V));
|
||||
|
||||
LOGS_DEFAULT(VERBOSE) << "BatchNorm ACL:";
|
||||
|
|
@ -70,7 +80,23 @@ Status BatchNorm<T>::Compute(OpKernelContext* context) const {
|
|||
|
||||
auto layer = std::make_shared<arm_compute::NEBatchNormalizationLayer>();
|
||||
|
||||
#ifdef ACL_2308
|
||||
arm_compute::TensorShape in_x_shape;
|
||||
const TensorShape& x_shape = X->Shape();
|
||||
const auto& dims_vec = x_shape.GetDims();
|
||||
in_x_shape.set(3, onnxruntime::narrow<size_t>(dims_vec[0])); // N
|
||||
in_x_shape.set(1, 1); // H
|
||||
size_t W = 1;
|
||||
for (size_t i = 2; i < dims_vec.size(); ++i) {
|
||||
W *= narrow<size_t>(dims_vec[i]);
|
||||
}
|
||||
in_x_shape.set(0, W); // W
|
||||
in_x_shape.set(2, onnxruntime::narrow<size_t>(dims_vec[1])); // C
|
||||
|
||||
tbatch_norm.in->allocator()->init(arm_compute::TensorInfo(in_x_shape, arm_compute::Format::F32));
|
||||
#else
|
||||
tbatch_norm.in->allocator()->init(arm_compute::TensorInfo(ACLTensorShape(X->Shape()), arm_compute::Format::F32));
|
||||
#endif
|
||||
tbatch_norm.out->allocator()->init(arm_compute::TensorInfo(tbatch_norm.in->info()->tensor_shape(), arm_compute::Format::F32));
|
||||
|
||||
tbatch_norm.scale->allocator()->init(arm_compute::TensorInfo(ACLTensorShape(S->Shape()), arm_compute::Format::F32));
|
||||
|
|
@ -132,11 +158,7 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX(
|
|||
7, 9,
|
||||
kAclExecutionProvider,
|
||||
KernelDefBuilder()
|
||||
.TypeConstraint("X", DataTypeImpl::GetTensorType<float>())
|
||||
.TypeConstraint("scale", DataTypeImpl::GetTensorType<float>())
|
||||
.TypeConstraint("B", DataTypeImpl::GetTensorType<float>())
|
||||
.TypeConstraint("mean", DataTypeImpl::GetTensorType<float>())
|
||||
.TypeConstraint("var", DataTypeImpl::GetTensorType<float>()),
|
||||
.TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
|
||||
BatchNorm<float>);
|
||||
|
||||
} // namespace acl
|
||||
|
|
|
|||
|
|
@ -31,9 +31,9 @@ typedef struct {
|
|||
typedef std::map<OpKernel*, ACLNEBatchNorm>::iterator BatchNormLayersIterator;
|
||||
|
||||
template <typename T>
|
||||
class BatchNorm final : public OpKernel {
|
||||
class BatchNorm : public onnxruntime::BatchNorm<T> {
|
||||
public:
|
||||
explicit BatchNorm(const OpKernelInfo& info) : OpKernel(info) {
|
||||
explicit BatchNorm(const OpKernelInfo& info) : onnxruntime::BatchNorm<T>(info) {
|
||||
auto st = info.GetAttr<float>("epsilon", &epsilon_);
|
||||
ORT_ENFORCE(st.IsOK(), st.ErrorMessage());
|
||||
|
||||
|
|
|
|||
|
|
@ -105,7 +105,11 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
|
|||
TensorShapeVector Y_dims;
|
||||
Y_dims.insert(Y_dims.begin(), {N, M});
|
||||
TensorShape input_shape = X->Shape().Slice(2);
|
||||
#ifdef ACL_2308
|
||||
ORT_RETURN_IF_ERROR(conv_attrs_.InferPadsAndOutputShape(input_shape, kernel_shape, strides, dilations, pads, Y_dims));
|
||||
#else
|
||||
ORT_RETURN_IF_ERROR(conv_attrs_.InferOutputShape(input_shape, kernel_shape, strides, dilations, pads, Y_dims));
|
||||
#endif
|
||||
Tensor* Y = context->Output(0, TensorShape(Y_dims));
|
||||
LOGS_DEFAULT(VERBOSE) << "Y " << Y->Shape().ToString().c_str();
|
||||
|
||||
|
|
@ -222,6 +226,15 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
|
|||
1 /* depth multiplier */,
|
||||
acl_activ_enabled ? arm_compute::ActivationLayerInfo(acl_activ_func, conv_attrs_.alpha) : arm_compute::ActivationLayerInfo(),
|
||||
arm_compute::Size2D(aclDilation0, dilations[0])));
|
||||
#elif defined(ACL_2308)
|
||||
bool optimizable = bool(arm_compute::NEDepthwiseConvolutionLayer::validate(tconv.in->info(),
|
||||
tconv.k->info(),
|
||||
(B != nullptr) ? tconv.b->info() : nullptr,
|
||||
tconv.out->info(),
|
||||
aclPadStride,
|
||||
1 /* depth multiplier */,
|
||||
acl_activ_enabled ? arm_compute::ActivationLayerInfo(acl_activ_func, conv_attrs_.alpha) : arm_compute::ActivationLayerInfo(),
|
||||
arm_compute::Size2D(aclDilation0, dilations[0])));
|
||||
#endif
|
||||
|
||||
if (optimizable) {
|
||||
|
|
@ -230,7 +243,7 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
|
|||
auto layer = std::make_shared<arm_compute::NEDepthwiseConvolutionLayer3x3>();
|
||||
#elif defined(ACL_1908)
|
||||
auto layer = std::make_shared<arm_compute::NEDepthwiseConvolutionLayerOptimized>();
|
||||
#elif defined(ACL_2002)
|
||||
#elif defined(ACL_2002) || defined(ACL_2308)
|
||||
auto layer = std::make_shared<arm_compute::NEDepthwiseConvolutionLayer>();
|
||||
#endif
|
||||
|
||||
|
|
@ -238,7 +251,7 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
|
|||
layer->configure(tconv.in.get(), tconv.k.get(), (B != nullptr) ? tconv.b.get() : nullptr, tconv.out.get(),
|
||||
aclPadStride, 1 /* depth multiplier */,
|
||||
acl_activ_enabled ? arm_compute::ActivationLayerInfo(acl_activ_func, conv_attrs_.alpha) : arm_compute::ActivationLayerInfo());
|
||||
#elif defined(ACL_1905) || defined(ACL_1908) || defined(ACL_2002)
|
||||
#elif defined(ACL_1905) || defined(ACL_1908) || defined(ACL_2002) || defined(ACL_2308)
|
||||
layer->configure(tconv.in.get(), tconv.k.get(), (B != nullptr) ? tconv.b.get() : nullptr, tconv.out.get(),
|
||||
aclPadStride, 1 /* depth multiplier */,
|
||||
acl_activ_enabled ? arm_compute::ActivationLayerInfo(acl_activ_func, conv_attrs_.alpha) : arm_compute::ActivationLayerInfo(),
|
||||
|
|
|
|||
|
|
@ -8,6 +8,9 @@
|
|||
#include "core/providers/acl/acl_execution_provider.h"
|
||||
|
||||
// ACL
|
||||
#ifdef ACL_2308
|
||||
#include "arm_compute/runtime/Tensor.h"
|
||||
#endif
|
||||
#include "arm_compute/core/TensorInfo.h"
|
||||
#include "arm_compute/runtime/TensorAllocator.h"
|
||||
#include "arm_compute/runtime/Allocator.h"
|
||||
|
|
|
|||
|
|
@ -61,7 +61,14 @@ ACLNEPool PoolOperation(onnxruntime::OpKernelContext* context,
|
|||
tpool.out->allocator()->init(arm_compute::TensorInfo(ACLTensorShape(Y->Shape(), PREF_DIM), arm_compute::Format::F32));
|
||||
|
||||
if (pool_attrs.global_pooling) {
|
||||
layer->configure(tpool.in.get(), tpool.out.get(), arm_compute::PoolingLayerInfo(pool_type));
|
||||
layer->configure(tpool.in.get(),
|
||||
tpool.out.get(),
|
||||
arm_compute::PoolingLayerInfo(pool_type
|
||||
#ifdef ACL_2308
|
||||
,
|
||||
arm_compute::DataLayout::NCHW
|
||||
#endif
|
||||
));
|
||||
} else {
|
||||
TensorShapeVector aclStrides(2);
|
||||
aclStrides[0] = (strides.size() == 2) ? strides[1] : 1;
|
||||
|
|
@ -104,7 +111,13 @@ ACLNEPool PoolOperation(onnxruntime::OpKernelContext* context,
|
|||
LOGS_DEFAULT(VERBOSE) << "strides: {" << aclStrides[0] << "," << aclStrides[1] << "}";
|
||||
LOGS_DEFAULT(VERBOSE) << "excludePadding: " << excludePadding;
|
||||
|
||||
arm_compute::PoolingLayerInfo pool_info(pool_type, aclSize, aclPadStride, excludePadding);
|
||||
arm_compute::PoolingLayerInfo pool_info(pool_type,
|
||||
aclSize,
|
||||
#ifdef ACL_2308
|
||||
arm_compute::DataLayout::NCHW,
|
||||
#endif
|
||||
aclPadStride,
|
||||
excludePadding);
|
||||
layer->configure(tpool.in.get(), tpool.out.get(), pool_info);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -10,6 +10,8 @@
|
|||
#include "core/providers/acl/acl_common.h"
|
||||
#include "core/providers/acl/acl_fwd.h"
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#define PREF_DIM 4
|
||||
|
||||
namespace onnxruntime {
|
||||
|
|
@ -22,17 +24,27 @@ Status Concat<T>::Compute(OpKernelContext* ctx) const {
|
|||
return onnxruntime::Concat::Compute(ctx);
|
||||
}
|
||||
|
||||
if (axis_ < 0) {
|
||||
LOGS_DEFAULT(WARNING) << "ACL does not have support for negative axis; defaulting to cpu implementation";
|
||||
return onnxruntime::Concat::Compute(ctx);
|
||||
}
|
||||
|
||||
// Number of input tensors to concatenate
|
||||
auto input_count = Node().InputArgCount().front();
|
||||
|
||||
// Hold pointers to the input tensors to be used in the PrepareForCompute() step
|
||||
std::vector<const Tensor*> input_tensors;
|
||||
input_tensors.reserve(input_count);
|
||||
int empty_tensors = 0;
|
||||
for (int i = 0; i < input_count; ++i) {
|
||||
if (ctx->Input<Tensor>(i)->Shape().Size() == 0) {
|
||||
empty_tensors++;
|
||||
continue;
|
||||
}
|
||||
input_tensors.push_back(ctx->Input<Tensor>(i));
|
||||
}
|
||||
input_count -= empty_tensors;
|
||||
|
||||
auto output_dims = input_tensors[0]->Shape().AsShapeVector();
|
||||
auto output_dims = ctx->Input<Tensor>(0)->Shape().AsShapeVector();
|
||||
|
||||
// 'Concat' mode
|
||||
if (!is_stack_) {
|
||||
|
|
@ -64,7 +76,11 @@ Status Concat<T>::Compute(OpKernelContext* ctx) const {
|
|||
LOGS_DEFAULT(VERBOSE) << "Concat ACL:";
|
||||
|
||||
arm_compute::Tensor output;
|
||||
#ifdef ACL_2308
|
||||
std::vector<const arm_compute::ITensor*> inputs_vector;
|
||||
#else
|
||||
std::vector<arm_compute::ITensor*> inputs_vector;
|
||||
#endif
|
||||
for (int i = 0; i < input_count; i++) {
|
||||
arm_compute::Tensor* input = new arm_compute::Tensor();
|
||||
auto X = input_tensors[i];
|
||||
|
|
@ -75,7 +91,9 @@ Status Concat<T>::Compute(OpKernelContext* ctx) const {
|
|||
}
|
||||
|
||||
arm_compute::NEConcatenateLayer layer;
|
||||
layer.configure(inputs_vector, &output, 3 - axis_);
|
||||
if (input_count > 0) {
|
||||
layer.configure(inputs_vector, &output, 3 - axis_);
|
||||
}
|
||||
|
||||
LOGS_DEFAULT(VERBOSE) << "axis: " << axis_;
|
||||
LOGS_DEFAULT(VERBOSE) << std::endl;
|
||||
|
|
@ -83,7 +101,11 @@ Status Concat<T>::Compute(OpKernelContext* ctx) const {
|
|||
for (int i = 0; i < input_count; i++) {
|
||||
auto X = input_tensors[i];
|
||||
const T* x_data = X->Data<T>();
|
||||
#ifdef ACL_2308
|
||||
arm_compute::Tensor* in = const_cast<arm_compute::Tensor*>(static_cast<const arm_compute::Tensor*>(inputs_vector[i]));
|
||||
#else
|
||||
arm_compute::Tensor* in = static_cast<arm_compute::Tensor*>(inputs_vector[i]);
|
||||
#endif
|
||||
|
||||
if (X->Shape().Size() != 0 && in->info()->has_padding()) {
|
||||
in->allocator()->allocate();
|
||||
|
|
@ -101,7 +123,9 @@ Status Concat<T>::Compute(OpKernelContext* ctx) const {
|
|||
ACLImportMemory(output.allocator(), (void*)y_data, Y->Shape().Size() * 4);
|
||||
}
|
||||
|
||||
layer.run();
|
||||
if (input_count > 0) {
|
||||
layer.run();
|
||||
}
|
||||
|
||||
if (Y->Shape().Size() != 0 && output.info()->has_padding()) {
|
||||
importDataFromTensor<T>(&output, y_data);
|
||||
|
|
|
|||
|
|
@ -606,7 +606,7 @@ def parse_arguments():
|
|||
"--use_acl",
|
||||
nargs="?",
|
||||
const="ACL_1905",
|
||||
choices=["ACL_1902", "ACL_1905", "ACL_1908", "ACL_2002"],
|
||||
choices=["ACL_1902", "ACL_1905", "ACL_1908", "ACL_2002", "ACL_2308"],
|
||||
help="Build with ACL for ARM architectures.",
|
||||
)
|
||||
parser.add_argument("--acl_home", help="Path to ACL home dir")
|
||||
|
|
@ -1031,6 +1031,7 @@ def generate_build_tree(
|
|||
"-Donnxruntime_USE_ACL_1905=" + ("ON" if args.use_acl == "ACL_1905" else "OFF"),
|
||||
"-Donnxruntime_USE_ACL_1908=" + ("ON" if args.use_acl == "ACL_1908" else "OFF"),
|
||||
"-Donnxruntime_USE_ACL_2002=" + ("ON" if args.use_acl == "ACL_2002" else "OFF"),
|
||||
"-Donnxruntime_USE_ACL_2308=" + ("ON" if args.use_acl == "ACL_2308" else "OFF"),
|
||||
"-Donnxruntime_USE_ARMNN=" + ("ON" if args.use_armnn else "OFF"),
|
||||
"-Donnxruntime_ARMNN_RELU_USE_CPU=" + ("OFF" if args.armnn_relu else "ON"),
|
||||
"-Donnxruntime_ARMNN_BN_USE_CPU=" + ("OFF" if args.armnn_bn else "ON"),
|
||||
|
|
|
|||
Loading…
Reference in a new issue