diff --git a/CMakeLists.txt b/CMakeLists.txt
index 93793d22790..48730cfa555 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1250,6 +1250,18 @@ if(USE_MIMALLOC)
   include_directories(third_party/mimalloc/include)
 endif()
 
+if(USE_MKLDNN_ACL)
+  find_package(ACL REQUIRED)
+  if(ACL_FOUND)
+    include_directories(${ACL_INCLUDE_DIRS})
+    message(STATUS "ACL Include: ${ACL_INCLUDE_DIRS}")
+    message(STATUS "ACL Library: ${ACL_LIBRARIES}")
+  else()
+    message(FATAL_ERROR "ACL not found")
+  endif()
+endif()
+
+
 if(USE_MIMALLOC AND USE_MIMALLOC_ON_MKL)
   add_definitions(-DUSE_MIMALLOC_ON_MKL)
 endif()
diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt
index ab95de5036b..02511d889da 100644
--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@@ -458,6 +458,12 @@ if(MKLDNN_FOUND)
   list(APPEND ATen_CPU_DEPENDENCY_LIBS ${MKLDNN_LIBRARIES})
 endif(MKLDNN_FOUND)
 
+if(USE_MKLDNN_ACL)
+  list(APPEND ATen_CPU_INCLUDE ${ACL_INCLUDE_DIRS})
+  list(APPEND ATen_CPU_DEPENDENCY_LIBS ${ACL_LIBRARIES})
+endif()
+
+
 if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x|ppc64le)$")
   list(APPEND ATen_CPU_DEPENDENCY_LIBS cpuinfo)
 endif()
diff --git a/aten/src/ATen/native/quantized/cpu/ACLUtils.h b/aten/src/ATen/native/quantized/cpu/ACLUtils.h
new file mode 100644
index 00000000000..a33ec5143db
--- /dev/null
+++ b/aten/src/ATen/native/quantized/cpu/ACLUtils.h
@@ -0,0 +1,251 @@
+#pragma once
+
+#include <ATen/Config.h>
+#if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED()
+
+#include <ATen/Tensor.h>
+#include <ATen/native/quantized/cpu/OnednnUtils.h>
+#include <arm_compute/core/TensorInfo.h>
+#include <arm_compute/runtime/Allocator.h>
+#include <arm_compute/runtime/MemoryManagerOnDemand.h>
+#include <arm_compute/runtime/NEON/functions/NEActivationLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h>
+#include <arm_compute/runtime/NEON/functions/NEQuantizationLayer.h>
+#include <arm_compute/runtime/Tensor.h>
+#include <array>
+
+using ACLDynamicQuantMatmulCacheKey = std::tuple<
+    int64_t, // M
+    bool, // FUSE_RELU
+    int64_t // NUM_THREADS
+    >;
+
+enum ACLDynamicQuantMatmulCacheKeyIndex {
+  M,
+  FUSE_RELU,
+  NUM_THREADS,
+};
+
+struct ACLDynamicQuantMatmul {
+  arm_compute::Tensor src_s8_tensor;
+  arm_compute::Tensor src_fp32_tensor;
+  arm_compute::Tensor wei_tensor;
+  arm_compute::Tensor bia_tensor;
+  arm_compute::Tensor dst_tensor;
+  arm_compute::NEQuantizationLayer quant;
+  std::shared_ptr<arm_compute::MemoryManagerOnDemand> memory_manager{
+      arm_compute::MemoryManagerOnDemand::make_default()};
+  arm_compute::NEGEMMLowpMatrixMultiplyCore gemm{memory_manager};
+  arm_compute::NEActivationLayer acl_relu;
+  // configuration details for the ACL gemm
+  arm_compute::TensorInfo src_s8_tensor_info;
+  arm_compute::TensorInfo src_fp32_tensor_info;
+  arm_compute::TensorInfo wei_tensor_info;
+  arm_compute::TensorInfo bia_tensor_info;
+  arm_compute::TensorInfo dst_tensor_info;
+  arm_compute::GEMMInfo gemm_info;
+  arm_compute::ActivationLayerInfo acl_relu_info{
+      arm_compute::ActivationFunction::RELU};
+  bool with_bias{false};
+
+  // key for use in the cache
+  ACLDynamicQuantMatmulCacheKey key;
+
+  ~ACLDynamicQuantMatmul() {
+    // this will free memory allocated for the quantized src tensor since the
+    // allocation happened through ACL: src_s8_tensor.allocator()->allocate()
+    src_s8_tensor.allocator()->free();
+    // this will not free memory, it will just tell ACL that we're no longer
+    // using the pointer
+    wei_tensor.allocator()->free();
+    if (with_bias) {
+      bia_tensor.allocator()->free();
+    }
+    // deallocate memory used for auxiliary tensors
+    memory_manager->clear();
+  }
+};
+
+struct PackedLinearWeightsACL : public PackedLinearWeightsOnednn {
+  PackedLinearWeightsACL(
+      std::unique_ptr<ideep::tensor> weight,
+      std::optional<ideep::tensor> bias,
+      at::Tensor orig_weight,
+      std::optional<at::Tensor> orig_bias)
+      : PackedLinearWeightsOnednn(
+            std::move(weight),
+            std::move(bias),
+            std::move(orig_weight),
+            std::move(orig_bias)) {
+    auto w = *(weight_.get());
+    k_ = w.get_dim(0);
+    n_ = w.get_dim(1);
+    wei_zero_point_ = orig_weight_.q_zero_point();
+    wei_scale_ = orig_weight_.q_scale();
+  }
+
+  int64_t k_;
+  int64_t n_;
+  int64_t wei_zero_point_;
+  double wei_scale_;
+
+  at::Tensor apply_dynamic(at::Tensor input, bool reduce_range = false)
+      override;
+  at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range = false)
+      override;
+
+  std::shared_ptr<ACLDynamicQuantMatmul> get_acl_dynamic_quant_matmul(
+      const ACLDynamicQuantMatmulCacheKey& key) {
+    // We're only maintaining a 2-element LRU cache
+    // hit first
+    if (acl_dynamic_quant_cache[0] != nullptr &&
+        acl_dynamic_quant_cache[0]->key == key) {
+      return acl_dynamic_quant_cache[0];
+    }
+    // hit second
+    if (acl_dynamic_quant_cache[1] != nullptr &&
+        acl_dynamic_quant_cache[1]->key == key) {
+      // update LRU
+      std::rotate(
+          acl_dynamic_quant_cache.begin(),
+          acl_dynamic_quant_cache.begin() + 1,
+          acl_dynamic_quant_cache.end());
+      return acl_dynamic_quant_cache[0];
+    }
+    // miss -> replace Least Recently Used - i.e. element at index 1
+    acl_dynamic_quant_cache[1] = create_acl_dynamic_quant_matmul(key);
+    std::rotate(
+        acl_dynamic_quant_cache.begin(),
+        acl_dynamic_quant_cache.begin() + 1,
+        acl_dynamic_quant_cache.end());
+    return acl_dynamic_quant_cache[0];
+  }
+
+ private:
+  // A 2-element (per layer) cache. Given that it's not intended to store more
+  // than 2 elements, we do not need a fancy implementation. The idea behind it
+  // is to allow for a (configuration free) fast path for autoregressive
+  // transformer-like models, which usually involve 2 input tensor shapes: one
+  // for the prefill phase and another for the autoregressive phase.
+  std::array<std::shared_ptr<ACLDynamicQuantMatmul>, 2> acl_dynamic_quant_cache;
+
+  std::shared_ptr<ACLDynamicQuantMatmul> create_acl_dynamic_quant_matmul(
+      const ACLDynamicQuantMatmulCacheKey& key) {
+    int64_t m = std::get<M>(key);
+    bool fuse_relu = std::get<FUSE_RELU>(key);
+    auto acl_gemm = std::make_shared<ACLDynamicQuantMatmul>();
+    acl_gemm->with_bias = bias_.has_value();
+    acl_gemm->key = key;
+    acl_gemm->src_fp32_tensor_info = arm_compute::TensorInfo(
+        arm_compute::TensorShape(k_, m), arm_compute::Format::F32);
+
+    acl_gemm->src_fp32_tensor_info.set_are_values_constant(false);
+
+    acl_gemm->src_s8_tensor_info = arm_compute::TensorInfo(
+        arm_compute::TensorShape(k_, m),
+        1,
+        arm_compute::DataType::QASYMM8_SIGNED,
+        // TODO: setting the initial offset value to int8_t max instead of
+        // zero, because ACL currently skips the MatrixBReduction calculation
+        // if the source offset at configuration time is zero. This is fixed by
+        // this PR: https://review.mlplatform.org/c/ml/ComputeLibrary/+/12820/8
+        // The offset will be set to the actual src offset value at runtime.
+        arm_compute::QuantizationInfo(
+            1.0, std::numeric_limits<int8_t>::max(), true));
+    acl_gemm->src_s8_tensor_info.set_are_values_constant(false);
+
+    acl_gemm->wei_tensor_info = arm_compute::TensorInfo(
+        arm_compute::TensorShape(n_, k_),
+        1,
+        arm_compute::DataType::QASYMM8_SIGNED,
+        arm_compute::QuantizationInfo(wei_scale_, wei_zero_point_, true));
+    acl_gemm->wei_tensor_info.set_are_values_constant(true);
+
+    acl_gemm->bia_tensor_info = arm_compute::TensorInfo(
+        arm_compute::TensorShape(), 1, arm_compute::DataType::F32);
+    if (acl_gemm->with_bias) {
+      acl_gemm->bia_tensor_info.set_tensor_shape(
+          arm_compute::TensorShape(1, n_));
+    }
+    acl_gemm->dst_tensor_info = arm_compute::TensorInfo(
+        arm_compute::TensorShape(n_, m), arm_compute::Format::F32);
+
+    // validate that ACL can handle the given problem and inputs.
+    if (fuse_relu) {
+      arm_compute::Status relu_status =
+          arm_compute::NEActivationLayer::validate(
+              &acl_gemm->dst_tensor_info,
+              &acl_gemm->dst_tensor_info,
+              acl_gemm->acl_relu_info);
+      if (relu_status.error_code() != arm_compute::ErrorCode::OK) {
+        return nullptr;
+      }
+    }
+    arm_compute::Status quant_status =
+        arm_compute::NEQuantizationLayer::validate(
+            &acl_gemm->src_fp32_tensor_info, &acl_gemm->src_s8_tensor_info);
+    if (quant_status.error_code() != arm_compute::ErrorCode::OK) {
+      return nullptr;
+    }
+    arm_compute::Status gemm_status =
+        arm_compute::NEGEMMLowpMatrixMultiplyCore::validate(
+            &acl_gemm->src_s8_tensor_info,
+            &acl_gemm->wei_tensor_info,
+            acl_gemm->with_bias ? &acl_gemm->bia_tensor_info : nullptr,
+            &acl_gemm->dst_tensor_info,
+            acl_gemm->gemm_info);
+
+    if (gemm_status.error_code() != arm_compute::ErrorCode::OK) {
+      return nullptr;
+    }
+
+    // set the tensor info (i.e. shape, datatype, quant info) for the ACL
+    // tensors
+    acl_gemm->src_fp32_tensor.allocator()->init(acl_gemm->src_fp32_tensor_info);
+    acl_gemm->src_s8_tensor.allocator()->init(acl_gemm->src_s8_tensor_info);
+    acl_gemm->wei_tensor.allocator()->init(acl_gemm->wei_tensor_info);
+    if (acl_gemm->with_bias) {
+      acl_gemm->bia_tensor.allocator()->init(acl_gemm->bia_tensor_info);
+    }
+    acl_gemm->dst_tensor.allocator()->init(acl_gemm->dst_tensor_info);
+
+    // allocate memory only for the quantized tensor; the rest will use memory
+    // already available from PyTorch
+    acl_gemm->src_s8_tensor.allocator()->allocate();
+    // give ACL access to the weight and bias pointers
+    acl_gemm->wei_tensor.allocator()->import_memory(
+        (int8_t*)weight_.get()->get_data_handle());
+    if (bias_.has_value()) {
+      acl_gemm->bia_tensor.allocator()->import_memory(
+          (float*)bias_.value().get_data_handle());
+    }
+
+    // configure
+    acl_gemm->quant.configure(
+        &acl_gemm->src_fp32_tensor, &acl_gemm->src_s8_tensor);
+
+    acl_gemm->gemm.configure(
+        &acl_gemm->src_s8_tensor,
+        &acl_gemm->wei_tensor,
+        acl_gemm->with_bias ? &acl_gemm->bia_tensor : nullptr,
+        &acl_gemm->dst_tensor,
+        acl_gemm->gemm_info);
+
+    if (fuse_relu) {
+      acl_gemm->acl_relu.configure(
+          &acl_gemm->dst_tensor,
+          &acl_gemm->dst_tensor,
+          acl_gemm->acl_relu_info);
+    }
+
+    // allocate memory for ACL's auxiliary tensors
+    arm_compute::Allocator alloc{};
+    acl_gemm->memory_manager->populate(alloc, 1);
+
+    return acl_gemm;
+  }
+  template <bool ReluFused>
+  at::Tensor apply_dynamic_impl(at::Tensor input, bool reduce_range = false);
+};
+
+#endif // #if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED()
diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp
index 792db3242bc..7763b994d91 100644
--- a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp
@@ -5,6 +5,7 @@
 #include
 #include
 #include
+#include <ATen/native/quantized/cpu/ACLUtils.h>
 #include
 #include
 #include
@@ -697,6 +698,127 @@ static at::Tensor linear_dynamic_fp16_with_onednn_weight(
   primitive.execute(ideep::stream::default_stream(), args);
   return dim == 2 ?
       output : output.reshape(output_size);
 }
+
+#if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED()
+
+template <bool ReluFused>
+at::Tensor PackedLinearWeightsACL::apply_dynamic_impl(
+    at::Tensor input,
+    bool reduce_range) {
+  // Dynamic: fp32 * int8 -> fp32
+  using at::Tensor;
+
+  TORCH_CHECK(
+      input.dim() >= 2,
+      "The dimension of input tensor should be larger than or equal to 2");
+  TORCH_CHECK(
+      input.scalar_type() == c10::ScalarType::Float,
+      "qlinear_dynamic (ONEDNN): data type of input should be float.");
+
+  auto input_contig = input.contiguous();
+  const int64_t dim = input.dim();
+  auto input_reshaped =
+      dim == 2 ? input : input.reshape({-1, input.size(input.dim() - 1)});
+  auto input_dims = input_reshaped.sizes().vec();
+
+  int64_t m = input_dims[0];
+  auto key = std::make_tuple(
+      m, ReluFused, static_cast<int64_t>(at::get_num_threads()));
+  auto acl_gemm = get_acl_dynamic_quant_matmul(key);
+
+  if (acl_gemm) {
+    // Find quantization parameters
+    float x_max = 0, x_min = 0;
+
+#ifdef USE_FBGEMM
+    // Use FBGEMM's FindMinMax if available since it's faster
+    fbgemm::FindMinMax(
+        /*m=*/input_contig.data_ptr<float>(),
+        /*min=*/&x_min,
+        /*max=*/&x_max,
+        /*len=*/input.numel());
+#else
+    if (input_contig.numel() > 0) {
+      auto [t_min, t_max] = at::aminmax(input_contig);
+      x_max = t_max.item<float>();
+      x_min = t_min.item<float>();
+    }
+#endif
+
+    auto q_params = quant_utils::ChooseQuantizationParams(
+        /*min=*/x_min,
+        /*max=*/x_max,
+        /*qmin=*/std::numeric_limits<int8_t>::min(),
+        /*qmax=*/std::numeric_limits<int8_t>::max(),
+        /*preserve_sparsity=*/false,
+        /*force_scale_power_of_two=*/false,
+        /*reduce_range=*/reduce_range);
+
+    acl_gemm->src_fp32_tensor.allocator()->import_memory(
+        (float*)input_contig.data_ptr());
+
+    acl_gemm->src_s8_tensor.info()->set_quantization_info(
+        arm_compute::QuantizationInfo(
+            q_params.scale, q_params.zero_point, true));
+
+    // quantize src tensor: fp32 -> s8
+    acl_gemm->quant.run();
+
+    // allocate the fp32 output tensor
+    at::Tensor output = at::empty({m, n_}, input.options().dtype(at::kFloat));
+    if (output.numel() == 0)
+      return output;
+
+    // We set the offset to "-zero_point" for the GEMM, but to "zero_point"
+    // for the quantization layer. This is a known inconsistency in ACL.
+    acl_gemm->src_s8_tensor.info()->set_quantization_info(
+        arm_compute::QuantizationInfo(
+            q_params.scale, -q_params.zero_point, true));
+
+    acl_gemm->dst_tensor.allocator()->import_memory((float*)output.data_ptr());
+
+    // s8 src, s8 wei -> f32 dst
+    acl_gemm->gemm.run();
+
+    if (ReluFused) {
+      acl_gemm->acl_relu.run();
+    }
+
+    // this will not free memory, it will just tell ACL that we're no longer
+    // using the pointer
+    acl_gemm->src_fp32_tensor.allocator()->free();
+    acl_gemm->dst_tensor.allocator()->free();
+
+    auto out_sizes = input.sizes().vec();
+    out_sizes.back() = n_;
+    if (output.sizes().vec() == out_sizes)
+      return output;
+    return output.reshape(out_sizes);
+  }
+
+  // fall back to oneDNN in the unlikely scenario that ACL's validation fails
+  if (ReluFused) {
+    return PackedLinearWeightsOnednn::apply_dynamic_relu(input, reduce_range);
+  } else {
+    return PackedLinearWeightsOnednn::apply_dynamic(input, reduce_range);
+  }
+}
+
+at::Tensor PackedLinearWeightsACL::apply_dynamic(
+    at::Tensor input,
+    bool reduce_range) {
+  return apply_dynamic_impl</*ReluFused=*/false>(
+      std::move(input), reduce_range);
+}
+
+at::Tensor PackedLinearWeightsACL::apply_dynamic_relu(
+    at::Tensor input,
+    bool reduce_range) {
+  return apply_dynamic_impl</*ReluFused=*/true>(std::move(input), reduce_range);
+}
+
+#endif // #if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED()
+
 #endif // #if AT_MKLDNN_ENABLED()
 
 namespace at::native {
diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp
index 1bd41367c71..198cf415382 100644
--- a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp
@@ -1,15 +1,16 @@
 #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include
 #include
 #include
-#include
+#include
+#include
+#include
+#include
+#include
+#include
 #include
 #include
-#include
-#include
-#include
 #include
-#include
-#include
 #include
 #include
 #include
@@ -279,12 +280,15 @@ c10::intrusive_ptr<LinearPackedParamsBase> PackedLinearWeightsOnednn::prepack(
     packed_bias.init(bias_desc, b.data_ptr());
     onednn_bias = std::optional<ideep::tensor>(packed_bias);
   }
-  auto ret_ptr = c10::make_intrusive<PackedLinearWeightsOnednn>(
-      PackedLinearWeightsOnednn{
-          std::move(weight_ptr),
-          onednn_bias,
-          weight,
-          bias});
+#if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED()
+  if (qtype == c10::kPerTensorAffine) {
+    return c10::make_intrusive<PackedLinearWeightsACL>(PackedLinearWeightsACL{
+        std::move(weight_ptr), onednn_bias, weight, bias});
+  }
+#endif // #if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED()
+  auto ret_ptr =
+      c10::make_intrusive<PackedLinearWeightsOnednn>(PackedLinearWeightsOnednn{
+          std::move(weight_ptr), onednn_bias, weight, bias});
   return ret_ptr;
 }
 
diff --git a/cmake/Modules/FindACL.cmake b/cmake/Modules/FindACL.cmake
new file mode 100644
index 00000000000..12ddf384dff
--- /dev/null
+++ b/cmake/Modules/FindACL.cmake
@@ -0,0 +1,61 @@
+# ----------
+# FindACL
+# ----------
+#
+# Finds the Arm Compute Library
+# https://arm-software.github.io/ComputeLibrary/latest/
+#
+# This module defines the following variables:
+#
+#   ACL_FOUND        - True if ACL was found
+#   ACL_INCLUDE_DIRS - include directories for ACL
+#   ACL_LIBRARIES    - link against this library to use ACL
+#
+# The module will also define two cache variables:
+#
+#   ACL_INCLUDE_DIR  - the ACL include directory
+#   ACL_LIBRARY      - the path to the ACL library
+#
+
+# Use the ACL_ROOT_DIR environment variable to find the library and headers
+find_path(ACL_INCLUDE_DIR
+  NAMES arm_compute/graph.h
+  PATHS ENV ACL_ROOT_DIR
+  )
+
+find_library(ACL_LIBRARY
+  NAMES arm_compute
+  PATHS ENV ACL_ROOT_DIR
+  PATH_SUFFIXES lib build
+  )
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(ACL DEFAULT_MSG
+  ACL_INCLUDE_DIR
+  ACL_LIBRARY
+)
+
+mark_as_advanced(
+  ACL_LIBRARY
+  ACL_INCLUDE_DIR
+  )
+
+# Find the extra libraries and include dirs
+if(ACL_FOUND)
+  find_path(ACL_EXTRA_INCLUDE_DIR
+    NAMES half/half.hpp
+    PATHS ENV ACL_ROOT_DIR
+    PATH_SUFFIXES include
+    )
+
+  find_library(ACL_GRAPH_LIBRARY
+    NAMES arm_compute_graph
+    PATHS ENV ACL_ROOT_DIR
+    PATH_SUFFIXES lib build
+    )
+
+  list(APPEND ACL_INCLUDE_DIRS
+    ${ACL_INCLUDE_DIR} ${ACL_EXTRA_INCLUDE_DIR})
+  list(APPEND ACL_LIBRARIES
+    ${ACL_LIBRARY} ${ACL_GRAPH_LIBRARY})
+endif()
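
For reference, a downstream CMakeLists.txt would consume the new FindACL.cmake module roughly as in the minimal sketch below. It is not part of the patch: the project and target names are hypothetical, and it assumes the module is on CMAKE_MODULE_PATH and that the ACL_ROOT_DIR environment variable points at a Compute Library tree containing arm_compute/graph.h and a built libarm_compute, which is the same convention the USE_MKLDNN_ACL branch in the top-level CMakeLists.txt relies on.

    # Hypothetical consumer of FindACL.cmake (names are illustrative only)
    cmake_minimum_required(VERSION 3.18)
    project(acl_consumer CXX)

    # Make cmake/Modules/FindACL.cmake visible to find_package()
    list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules")

    # REQUIRED makes configuration fail if the ACL headers or library are missing
    find_package(ACL REQUIRED)

    add_executable(acl_smoke main.cpp)
    target_include_directories(acl_smoke PRIVATE ${ACL_INCLUDE_DIRS})
    target_link_libraries(acl_smoke PRIVATE ${ACL_LIBRARIES})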