Nnapi, add auto_pad support for Conv/GlobalAveragePool/AveragePool/GlobalMaxPool/MaxPool operators (#4499)

* Split ComputePadAndOutputShape into ComputePad and ComputeOutputShape * update NNAPI conv output shape compute to use shared ComputeOutputShape * switch from pointers to references for ComputePadAndOutputShape * nnapi conv support auto_pad * add logging operator support by target devices * update InferOutputShape/ComputePadAndOutputShape/ComputePad to use force_symmetric_auto_padding as param instead of template * make log op support for target devices optional * add auto_pad support to pool operators * ignore GetTargetDevices if using all devices * fix some typos in padding calculation * fix a bug in the padding computation difference between conv and pool ops * addressed CR comments, removed NNAPI device logging and moved nnapi ep autopad handling into a shared function * change helper functions to static
2026-06-04 23:59:56 +00:00 · 2020-07-15 00:21:42 -07:00 · 2020-07-15 00:21:42 -07:00 · cf92497c16
commit cf92497c16
parent 34f73fa1aa
12 changed files with 419 additions and 317 deletions
--- a/onnxruntime/contrib_ops/cpu/nchwc_ops.cc
+++ b/onnxruntime/contrib_ops/cpu/nchwc_ops.cc
@ -152,7 +152,7 @@ Status NchwcConv::Compute(OpKernelContext* context) const {
  std::vector<int64_t> Y_dims;
  Y_dims.insert(Y_dims.begin(), {X_shape[0], W_shape[0]});
  TensorShape input_shape = X->Shape().Slice(2);
-  ORT_RETURN_IF_ERROR(conv_attrs_.InferOutputShape(input_shape, kernel_shape, strides, dilations, &pads, &Y_dims));
+  ORT_RETURN_IF_ERROR(conv_attrs_.InferOutputShape(input_shape, kernel_shape, strides, dilations, pads, Y_dims));
  auto* Y = context->Output(0, Y_dims);
  auto* y_data = Y->template MutableData<float>();

--- a/onnxruntime/core/providers/acl/nn/conv.cc
+++ b/onnxruntime/core/providers/acl/nn/conv.cc
@ -60,7 +60,7 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
  ConvLayersIterator it = Conv::convLayers.find((OpKernel*)this);
  if (it != Conv::convLayers.end()) {
    pConv = &it->second;
-    if(pConv->isDepthwiseCPU == true) {
+    if (pConv->isDepthwiseCPU == true) {
      Status s = onnxruntime::Conv<T>::Compute(context);
      return s;
    }
@ -103,7 +103,7 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
  std::vector<int64_t> Y_dims;
  Y_dims.insert(Y_dims.begin(), {N, M});
  TensorShape input_shape = X->Shape().Slice(2);
-  ORT_RETURN_IF_ERROR(conv_attrs_.InferOutputShape(input_shape, kernel_shape, strides, dilations, &pads, &Y_dims));
+  ORT_RETURN_IF_ERROR(conv_attrs_.InferOutputShape(input_shape, kernel_shape, strides, dilations, pads, Y_dims));
  Tensor* Y = context->Output(0, TensorShape(Y_dims));
  LOGS_DEFAULT(VERBOSE) << "Y " << Y->Shape().ToString().c_str() << std::endl;

@ -127,7 +127,6 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
  }

  if (it == Conv::convLayers.end()) {
-
    auto mm_layer = ACLCreateMemoryManager();

    ACLNEConv tconv;
@ -192,21 +191,21 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
      // in the configure function for NEDepthwiseConvolutionLayer3x3, there is a separation based on the optimization
 #ifdef ACL_1902
      bool optimizable =
-            arm_compute::NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(tconv.in->info()->tensor_shape(),
-                                                                                               aclPadStride,
-                                                                                               tconv.in->info()->data_type(),
-                                                                                               1 /* depth multiplier */,
-                                                                                               tconv.in->info()->data_layout());
+          arm_compute::NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(tconv.in->info()->tensor_shape(),
+                                                                                             aclPadStride,
+                                                                                             tconv.in->info()->data_type(),
+                                                                                             1 /* depth multiplier */,
+                                                                                             tconv.in->info()->data_layout());
 #endif
 #if defined(ACL_1905) || defined(ACL_1908)
      bool optimizable =
-            arm_compute::NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(tconv.in->info(),
-                                                                                        tconv.k->info(),
-                                                                                        aclPadStride,
-                                                                                        1 /* depth multiplier */,
-                                                                                        arm_compute::Size2D(aclDilation0, dilations[0]));
+          arm_compute::NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(tconv.in->info(),
+                                                                                      tconv.k->info(),
+                                                                                      aclPadStride,
+                                                                                      1 /* depth multiplier */,
+                                                                                      arm_compute::Size2D(aclDilation0, dilations[0]));
 #endif
-      if(optimizable) {
+      if (optimizable) {
        //optimized depthwise convolution
 #if defined(ACL_1902) || defined(ACL_1905)
        auto layer = std::make_shared<arm_compute::NEDepthwiseConvolutionLayer3x3>();
@ -234,9 +233,9 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
        ret = Conv::convLayers.insert(std::pair<OpKernel*, ACLNEConv>((OpKernel*)this, tconv));
        return s;
      }
-#endif //DEPTHWISE_CPU
+#endif  //DEPTHWISE_CPU
    } else {
-      if(tconv.k->info()->tensor_shape()[0] == 1 && tconv.k->info()->tensor_shape()[1] == 1) {
+      if (tconv.k->info()->tensor_shape()[0] == 1 && tconv.k->info()->tensor_shape()[1] == 1) {
        //pointwise convolution
        Status s = onnxruntime::Conv<T>::Compute(context);
        return s;
@ -291,7 +290,6 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
    pConv->b->allocator()->free();
  pConv->out->allocator()->free();

-
  return Status::OK();
 }
 #else
--- a/onnxruntime/core/providers/armnn/nn/conv.cc
+++ b/onnxruntime/core/providers/armnn/nn/conv.cc
@ -26,258 +26,253 @@ thread_local std::map<OpKernel*, armnn::NetworkId> Conv<T>::convLayers;
 template <typename T>
 armnn::IRuntimePtr Conv<T>::run = Conv<T>::initRuntime();

-armnn::Convolution2dDescriptor createConvDescriptor(std::vector<int64_t> pads, std::vector<int64_t> dilations, std::vector<int64_t> strides, bool biasEnabled){
+armnn::Convolution2dDescriptor createConvDescriptor(std::vector<int64_t> pads, std::vector<int64_t> dilations, std::vector<int64_t> strides, bool biasEnabled) {
+  std::vector<int64_t> armnnStrides(2);
+  armnnStrides[0] = (strides.size() == 2) ? strides[1] : 1;
+  armnnStrides[1] = strides[0];

-    std::vector<int64_t> armnnStrides(2);
-    armnnStrides[0] = (strides.size() == 2) ? strides[1] : 1;
-    armnnStrides[1] = strides[0];
+  std::vector<int64_t> armnnDilations(2);
+  armnnDilations[0] = (dilations.size() == 2) ? dilations[1] : 1;
+  armnnDilations[1] = dilations[0];

-    std::vector<int64_t> armnnDilations(2);
-    armnnDilations[0] = (dilations.size() == 2) ? dilations[1] : 1;
-    armnnDilations[1] = dilations[0];
-
-    std::vector<int64_t> armnnPads(4);
-    if (pads.size() == 2) {
-      if (strides.size() == 1) {
-        armnnPads[0] = 0;
-        armnnPads[1] = 0;
-        armnnPads[2] = pads[1];
-        armnnPads[3] = pads[0];
-      } else {
-        armnnPads[0] = pads[1];
-        armnnPads[1] = pads[0];
-        armnnPads[2] = pads[1];
-        armnnPads[3] = pads[0];
-      }
+  std::vector<int64_t> armnnPads(4);
+  if (pads.size() == 2) {
+    if (strides.size() == 1) {
+      armnnPads[0] = 0;
+      armnnPads[1] = 0;
+      armnnPads[2] = pads[1];
+      armnnPads[3] = pads[0];
    } else {
      armnnPads[0] = pads[1];
-      armnnPads[1] = pads[3];
-      armnnPads[2] = pads[0];
-      armnnPads[3] = pads[2];
+      armnnPads[1] = pads[0];
+      armnnPads[2] = pads[1];
+      armnnPads[3] = pads[0];
    }
+  } else {
+    armnnPads[0] = pads[1];
+    armnnPads[1] = pads[3];
+    armnnPads[2] = pads[0];
+    armnnPads[3] = pads[2];
+  }

-    armnn::Convolution2dDescriptor convolutionDescriptor;
-    convolutionDescriptor.m_PadLeft = armnnPads[0];
-    convolutionDescriptor.m_PadRight = armnnPads[1];
-    convolutionDescriptor.m_PadTop = armnnPads[2];
-    convolutionDescriptor.m_PadBottom = armnnPads[3];
-    convolutionDescriptor.m_StrideX = armnnStrides[0];
-    convolutionDescriptor.m_StrideY = armnnStrides[1];
-    convolutionDescriptor.m_DilationX = armnnDilations[0];
-    convolutionDescriptor.m_DilationY = armnnDilations[1];
-    convolutionDescriptor.m_BiasEnabled = biasEnabled;
-    convolutionDescriptor.m_DataLayout = armnn::DataLayout::NCHW;
+  armnn::Convolution2dDescriptor convolutionDescriptor;
+  convolutionDescriptor.m_PadLeft = armnnPads[0];
+  convolutionDescriptor.m_PadRight = armnnPads[1];
+  convolutionDescriptor.m_PadTop = armnnPads[2];
+  convolutionDescriptor.m_PadBottom = armnnPads[3];
+  convolutionDescriptor.m_StrideX = armnnStrides[0];
+  convolutionDescriptor.m_StrideY = armnnStrides[1];
+  convolutionDescriptor.m_DilationX = armnnDilations[0];
+  convolutionDescriptor.m_DilationY = armnnDilations[1];
+  convolutionDescriptor.m_BiasEnabled = biasEnabled;
+  convolutionDescriptor.m_DataLayout = armnn::DataLayout::NCHW;

-    return convolutionDescriptor;
+  return convolutionDescriptor;
 }

-armnn::DepthwiseConvolution2dDescriptor createDepthwiseDescriptor(armnn::Convolution2dDescriptor convolutionDescriptor){
+armnn::DepthwiseConvolution2dDescriptor createDepthwiseDescriptor(armnn::Convolution2dDescriptor convolutionDescriptor) {
+  armnn::DepthwiseConvolution2dDescriptor depthwiseDescriptor;
+  depthwiseDescriptor.m_PadLeft = convolutionDescriptor.m_PadLeft;
+  depthwiseDescriptor.m_PadRight = convolutionDescriptor.m_PadRight;
+  depthwiseDescriptor.m_PadTop = convolutionDescriptor.m_PadTop;
+  depthwiseDescriptor.m_PadBottom = convolutionDescriptor.m_PadBottom;
+  depthwiseDescriptor.m_StrideX = convolutionDescriptor.m_StrideX;
+  depthwiseDescriptor.m_StrideY = convolutionDescriptor.m_StrideY;
+  depthwiseDescriptor.m_DilationX = convolutionDescriptor.m_DilationX;
+  depthwiseDescriptor.m_DilationY = convolutionDescriptor.m_DilationY;
+  depthwiseDescriptor.m_BiasEnabled = convolutionDescriptor.m_BiasEnabled;
+  depthwiseDescriptor.m_DataLayout = convolutionDescriptor.m_DataLayout;

-    armnn::DepthwiseConvolution2dDescriptor depthwiseDescriptor;
-    depthwiseDescriptor.m_PadLeft      = convolutionDescriptor.m_PadLeft;
-    depthwiseDescriptor.m_PadRight     = convolutionDescriptor.m_PadRight;
-    depthwiseDescriptor.m_PadTop       = convolutionDescriptor.m_PadTop;
-    depthwiseDescriptor.m_PadBottom    = convolutionDescriptor.m_PadBottom;
-    depthwiseDescriptor.m_StrideX      = convolutionDescriptor.m_StrideX;
-    depthwiseDescriptor.m_StrideY      = convolutionDescriptor.m_StrideY;
-    depthwiseDescriptor.m_DilationX    = convolutionDescriptor.m_DilationX;
-    depthwiseDescriptor.m_DilationY    = convolutionDescriptor.m_DilationY;
-    depthwiseDescriptor.m_BiasEnabled  = convolutionDescriptor.m_BiasEnabled;
-    depthwiseDescriptor.m_DataLayout   = convolutionDescriptor.m_DataLayout;
-
-    return depthwiseDescriptor;
+  return depthwiseDescriptor;
 }

 template <typename T>
 Status Conv<T>::Compute(OpKernelContext* context) const {
  size_t num_inputs = OpKernel::Node().InputDefs().size();
  const Tensor* X = context->Input<Tensor>(0);
-    const Tensor* W = context->Input<Tensor>(1);
-    const Tensor* B = num_inputs == 3 ? context->Input<Tensor>(2) : nullptr;
+  const Tensor* W = context->Input<Tensor>(1);
+  const Tensor* B = num_inputs == 3 ? context->Input<Tensor>(2) : nullptr;

-    const int64_t N = X->Shape()[0];
-    const int64_t M = W->Shape()[0];
+  const int64_t N = X->Shape()[0];
+  const int64_t M = W->Shape()[0];

-    if (X->Shape().NumDimensions() != PREF_DIM) {
+  if (X->Shape().NumDimensions() != PREF_DIM) {
+    Status s = onnxruntime::Conv<T>::Compute(context);
+    return s;
+  }
+
+  ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X, W));
+
+  std::vector<int64_t> kernel_shape;
+  ORT_RETURN_IF_ERROR(conv_attrs_.ComputeKernelShape(W->Shape(), kernel_shape));
+
+  std::vector<int64_t> pads(conv_attrs_.pads);
+  if (pads.empty()) {
+    pads.resize(kernel_shape.size() * 2, 0);
+  }
+  std::vector<int64_t> dilations(conv_attrs_.dilations);
+  if (dilations.empty()) {
+    dilations.resize(kernel_shape.size(), 1);
+  }
+  std::vector<int64_t> strides(conv_attrs_.strides);
+  if (strides.empty()) {
+    strides.resize(kernel_shape.size(), 1);
+  }
+
+  std::vector<int64_t> Y_dims;
+  Y_dims.insert(Y_dims.begin(), {N, M});
+  TensorShape input_shape = X->Shape().Slice(2);
+  ORT_RETURN_IF_ERROR(conv_attrs_.InferOutputShape(input_shape, kernel_shape, strides, dilations, pads, Y_dims));
+  Tensor* Y = context->Output(0, TensorShape(Y_dims));
+
+  bool biasEnabled = B != nullptr;
+
+  const T* x_data = X->template Data<T>();
+  const T* k_data = W->template Data<T>();
+
+  const T* b_data;
+  if (biasEnabled) {
+    b_data = B->template Data<T>();
+  }
+
+  T* y_data = Y->template MutableData<T>();
+
+  armnn::NetworkId* pNetworkId;
+  ConvLayersIterator it = Conv::convLayers.find((OpKernel*)this);
+  if (it == Conv::convLayers.end()) {
+    armnn::NetworkId networkId;
+    armnn::INetworkPtr myNetwork = armnn::INetwork::Create();
+
+    armnn::Convolution2dDescriptor convolutionDescriptor = createConvDescriptor(pads, dilations, strides, biasEnabled);
+
+    armnn::IConnectableLayer* convolution_armnn;
+    armnn::TensorShape inputShape = ArmNNTensorShape(X->Shape());
+    armnn::TensorShape weightShape = ArmNNTensorShape(W->Shape());
+
+    if (weightShape[2] == 1 && weightShape[3] == 1) {
      Status s = onnxruntime::Conv<T>::Compute(context);
      return s;
    }

-    ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X, W));
+    if (conv_attrs_.group > 1) {
+      if (conv_attrs_.group == inputShape[1]) {
+        // depthwise convolution
+        armnn::DepthwiseConvolution2dDescriptor depthwiseDescriptor = createDepthwiseDescriptor(convolutionDescriptor);

-    std::vector<int64_t> kernel_shape;
-    ORT_RETURN_IF_ERROR(conv_attrs_.ComputeKernelShape(W->Shape(), kernel_shape));
-
-    std::vector<int64_t> pads(conv_attrs_.pads);
-    if (pads.empty()) {
-      pads.resize(kernel_shape.size() * 2, 0);
-    }
-    std::vector<int64_t> dilations(conv_attrs_.dilations);
-    if (dilations.empty()) {
-      dilations.resize(kernel_shape.size(), 1);
-    }
-    std::vector<int64_t> strides(conv_attrs_.strides);
-    if (strides.empty()) {
-      strides.resize(kernel_shape.size(), 1);
-    }
-
-    std::vector<int64_t> Y_dims;
-    Y_dims.insert(Y_dims.begin(), {N, M});
-    TensorShape input_shape = X->Shape().Slice(2);
-    ORT_RETURN_IF_ERROR(conv_attrs_.InferOutputShape(input_shape, kernel_shape, strides, dilations, &pads, &Y_dims));
-    Tensor* Y = context->Output(0, TensorShape(Y_dims));
-
-    bool biasEnabled = B != nullptr;
-
-    const T* x_data = X->template Data<T>();
-    const T* k_data = W->template Data<T>();
-
-    const T* b_data;
-    if (biasEnabled) {
-      b_data = B->template Data<T>();
-    }
-
-    T* y_data = Y->template MutableData<T>();
-
-    armnn::NetworkId* pNetworkId;
-    ConvLayersIterator it = Conv::convLayers.find((OpKernel*)this);
-    if (it == Conv::convLayers.end()) {
-
-      armnn::NetworkId networkId;
-      armnn::INetworkPtr myNetwork = armnn::INetwork::Create();
-
-      armnn::Convolution2dDescriptor convolutionDescriptor = createConvDescriptor(pads, dilations, strides, biasEnabled);
-
-      armnn::IConnectableLayer *convolution_armnn;
-      armnn::TensorShape inputShape = ArmNNTensorShape(X->Shape());
-      armnn::TensorShape weightShape = ArmNNTensorShape(W->Shape());
-
-      if (weightShape[2] == 1 && weightShape[3] == 1) {
-        Status s = onnxruntime::Conv<T>::Compute(context);
-        return s;
-      }
-
-      if (conv_attrs_.group > 1) {
-
-        if (conv_attrs_.group == inputShape[1]) {
-          // depthwise convolution
-          armnn::DepthwiseConvolution2dDescriptor depthwiseDescriptor = createDepthwiseDescriptor(convolutionDescriptor);
-
-          weightShape[1] = weightShape[0];
-          weightShape[0] = 1;
-          armnn::TensorInfo weightsInfo(weightShape, armnn::DataType::Float32);
-          armnn::ConstTensor weights(weightsInfo, k_data);
-
-          if (biasEnabled) {
-            armnn::TensorInfo biasDesc(ArmNNTensorShape(B->Shape()), armnn::DataType::Float32);
-            armnn::ConstTensor bias(biasDesc, b_data);
-            convolution_armnn = myNetwork->AddDepthwiseConvolution2dLayer(depthwiseDescriptor,
-                                                                          weights,
-                                                                          armnn::Optional<armnn::ConstTensor>(bias),
-                                                                          "depthwise_convolution_armnn");
-          } else {
-            convolution_armnn = myNetwork->AddDepthwiseConvolution2dLayer(depthwiseDescriptor,
-                                                                          weights,
-                                                                          armnn::EmptyOptional(),
-                                                                          "depthwise_convolution_armnn");
-          }
-        } else {
-          // NCHWc convolution
-          Status s = onnxruntime::Conv<T>::Compute(context);
-          return s;
-        }
-      } else {
-        // normal convolution
+        weightShape[1] = weightShape[0];
+        weightShape[0] = 1;
        armnn::TensorInfo weightsInfo(weightShape, armnn::DataType::Float32);
        armnn::ConstTensor weights(weightsInfo, k_data);

        if (biasEnabled) {
          armnn::TensorInfo biasDesc(ArmNNTensorShape(B->Shape()), armnn::DataType::Float32);
          armnn::ConstTensor bias(biasDesc, b_data);
-          convolution_armnn = myNetwork->AddConvolution2dLayer(convolutionDescriptor,
-                                                               weights,
-                                                               armnn::Optional<armnn::ConstTensor>(bias),
-                                                               "convolution_armnn");
+          convolution_armnn = myNetwork->AddDepthwiseConvolution2dLayer(depthwiseDescriptor,
+                                                                        weights,
+                                                                        armnn::Optional<armnn::ConstTensor>(bias),
+                                                                        "depthwise_convolution_armnn");
        } else {
-          convolution_armnn = myNetwork->AddConvolution2dLayer(convolutionDescriptor,
-                                                               weights,
-                                                               armnn::EmptyOptional(),
-                                                               "convolution_armnn");
+          convolution_armnn = myNetwork->AddDepthwiseConvolution2dLayer(depthwiseDescriptor,
+                                                                        weights,
+                                                                        armnn::EmptyOptional(),
+                                                                        "depthwise_convolution_armnn");
        }
+      } else {
+        // NCHWc convolution
+        Status s = onnxruntime::Conv<T>::Compute(context);
+        return s;
      }
-
-      bool armnn_activ_enabled = false;
-      armnn::ActivationDescriptor desc;
-      desc.m_A = conv_attrs_.alpha;
-
-      if (activation_type == "Relu") {
-        desc.m_Function = armnn::ActivationFunction::ReLu;
-        armnn_activ_enabled = true;
-      } else if (activation_type == "LeakyRelu") {
-        desc.m_Function = armnn::ActivationFunction::LeakyReLu;
-        armnn_activ_enabled = true;
-      } else if (activation_type == "Tanh") {
-        desc.m_Function = armnn::ActivationFunction::TanH;
-        armnn_activ_enabled = true;
-      } else if (activation_type == "Sigmoid") {
-        desc.m_Function = armnn::ActivationFunction::Sigmoid;
-        armnn_activ_enabled = true;
-      } else if (!activation_type.empty()) {
-        ORT_NOT_IMPLEMENTED("Not implemented fused activation: ", activation_type);
-      }
-
-      armnn::IConnectableLayer* activation = myNetwork->AddActivationLayer(desc, "activation_armnn");
-
-      armnn::IConnectableLayer *InputLayer  = myNetwork->AddInputLayer(0);
-      armnn::IConnectableLayer *OutputLayer = myNetwork->AddOutputLayer(0);
-
-      InputLayer->GetOutputSlot(0).Connect(convolution_armnn->GetInputSlot(0));
-      if (armnn_activ_enabled) {
-        convolution_armnn->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
-        activation->GetOutputSlot(0).Connect(OutputLayer->GetInputSlot(0));
-      }
-      else {
-        convolution_armnn->GetOutputSlot(0).Connect(OutputLayer->GetInputSlot(0));
-      }
-
-      //Set the tensors in the network.
-      armnn::TensorInfo inputTensorInfo(inputShape, armnn::DataType::Float32);
-      InputLayer->GetOutputSlot(0).SetTensorInfo(inputTensorInfo);
-
-      armnn::TensorInfo outputTensorInfo(ArmNNTensorShape(Y->Shape()), armnn::DataType::Float32);
-      convolution_armnn->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);
-
-      if (armnn_activ_enabled) {
-        activation->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);
-      }
-
-      // Optimise ArmNN network
-      armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*myNetwork, {armnn::Compute::CpuAcc}, Conv::run->GetDeviceSpec());
-
-      if (optNet == nullptr) {
-        return onnxruntime::Conv<T>::Compute(context);
-      }
-
-      // Load graph into runtime
-      Conv::run->LoadNetwork(networkId, std::move(optNet));
-
-      std::pair<ConvLayersIterator, bool> ret;
-      ret = Conv::convLayers.insert(std::pair<OpKernel*, armnn::NetworkId>((OpKernel*)this, networkId));
-      pNetworkId = &ret.first->second;
-
    } else {
-      pNetworkId = &it->second;
+      // normal convolution
+      armnn::TensorInfo weightsInfo(weightShape, armnn::DataType::Float32);
+      armnn::ConstTensor weights(weightsInfo, k_data);
+
+      if (biasEnabled) {
+        armnn::TensorInfo biasDesc(ArmNNTensorShape(B->Shape()), armnn::DataType::Float32);
+        armnn::ConstTensor bias(biasDesc, b_data);
+        convolution_armnn = myNetwork->AddConvolution2dLayer(convolutionDescriptor,
+                                                             weights,
+                                                             armnn::Optional<armnn::ConstTensor>(bias),
+                                                             "convolution_armnn");
+      } else {
+        convolution_armnn = myNetwork->AddConvolution2dLayer(convolutionDescriptor,
+                                                             weights,
+                                                             armnn::EmptyOptional(),
+                                                             "convolution_armnn");
+      }
    }

-    armnn::InputTensors inputTensors{{0, armnn::ConstTensor(Conv::run->GetInputTensorInfo(*pNetworkId, 0),
-                                                            x_data)}};
-    armnn::OutputTensors outputTensors{{0, armnn::Tensor(Conv::run->GetOutputTensorInfo(*pNetworkId, 0),
-                                                         y_data)}};
+    bool armnn_activ_enabled = false;
+    armnn::ActivationDescriptor desc;
+    desc.m_A = conv_attrs_.alpha;

-    // Execute network
-    Conv::run->EnqueueWorkload(*pNetworkId, inputTensors, outputTensors);
+    if (activation_type == "Relu") {
+      desc.m_Function = armnn::ActivationFunction::ReLu;
+      armnn_activ_enabled = true;
+    } else if (activation_type == "LeakyRelu") {
+      desc.m_Function = armnn::ActivationFunction::LeakyReLu;
+      armnn_activ_enabled = true;
+    } else if (activation_type == "Tanh") {
+      desc.m_Function = armnn::ActivationFunction::TanH;
+      armnn_activ_enabled = true;
+    } else if (activation_type == "Sigmoid") {
+      desc.m_Function = armnn::ActivationFunction::Sigmoid;
+      armnn_activ_enabled = true;
+    } else if (!activation_type.empty()) {
+      ORT_NOT_IMPLEMENTED("Not implemented fused activation: ", activation_type);
+    }

-    return Status::OK();
+    armnn::IConnectableLayer* activation = myNetwork->AddActivationLayer(desc, "activation_armnn");
+
+    armnn::IConnectableLayer* InputLayer = myNetwork->AddInputLayer(0);
+    armnn::IConnectableLayer* OutputLayer = myNetwork->AddOutputLayer(0);
+
+    InputLayer->GetOutputSlot(0).Connect(convolution_armnn->GetInputSlot(0));
+    if (armnn_activ_enabled) {
+      convolution_armnn->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
+      activation->GetOutputSlot(0).Connect(OutputLayer->GetInputSlot(0));
+    } else {
+      convolution_armnn->GetOutputSlot(0).Connect(OutputLayer->GetInputSlot(0));
+    }
+
+    //Set the tensors in the network.
+    armnn::TensorInfo inputTensorInfo(inputShape, armnn::DataType::Float32);
+    InputLayer->GetOutputSlot(0).SetTensorInfo(inputTensorInfo);
+
+    armnn::TensorInfo outputTensorInfo(ArmNNTensorShape(Y->Shape()), armnn::DataType::Float32);
+    convolution_armnn->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);
+
+    if (armnn_activ_enabled) {
+      activation->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);
+    }
+
+    // Optimise ArmNN network
+    armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*myNetwork, {armnn::Compute::CpuAcc}, Conv::run->GetDeviceSpec());
+
+    if (optNet == nullptr) {
+      return onnxruntime::Conv<T>::Compute(context);
+    }
+
+    // Load graph into runtime
+    Conv::run->LoadNetwork(networkId, std::move(optNet));
+
+    std::pair<ConvLayersIterator, bool> ret;
+    ret = Conv::convLayers.insert(std::pair<OpKernel*, armnn::NetworkId>((OpKernel*)this, networkId));
+    pNetworkId = &ret.first->second;
+
+  } else {
+    pNetworkId = &it->second;
+  }
+
+  armnn::InputTensors inputTensors{{0, armnn::ConstTensor(Conv::run->GetInputTensorInfo(*pNetworkId, 0),
+                                                          x_data)}};
+  armnn::OutputTensors outputTensors{{0, armnn::Tensor(Conv::run->GetOutputTensorInfo(*pNetworkId, 0),
+                                                       y_data)}};
+
+  // Execute network
+  Conv::run->EnqueueWorkload(*pNetworkId, inputTensors, outputTensors);
+
+  return Status::OK();
 }

 ONNX_OPERATOR_VERSIONED_KERNEL_EX(
@ -296,5 +291,5 @@ ONNX_OPERATOR_KERNEL_EX(
    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
    Conv<float>);

-}  // namespace armnn
+}  // namespace armnn_ep
 }  // namespace onnxruntime
--- a/onnxruntime/core/providers/common.h
+++ b/onnxruntime/core/providers/common.h
@ -70,11 +70,11 @@ inline AutoPadType StringToAutoPadType(const std::string& str) {

 // helper function

-template <bool ForceSymmetricAutoPadding>
-Status ComputePad(const int64_t in_dim,
-                  const int64_t stride, const int64_t kernel, const int64_t dilation,
-                  AutoPadType pad_type,
-                  int64_t& pad_head, int64_t& pad_tail) {
+inline Status ComputePad(const int64_t in_dim,
+                         const int64_t stride, const int64_t kernel, const int64_t dilation,
+                         AutoPadType pad_type,
+                         int64_t& pad_head, int64_t& pad_tail,
+                         bool force_symmetric_auto_padding = false) {
  switch (pad_type) {
    case AutoPadType::NOTSET:
      break;
@ -91,7 +91,7 @@ Status ComputePad(const int64_t in_dim,
      int64_t legacy_target_size = (in_dim + stride - 1) / stride;
      int64_t pad_needed = (legacy_target_size - 1) * stride + kernel - in_dim;
      // make sure padding is symmetric
-      if (ForceSymmetricAutoPadding) {
+      if (force_symmetric_auto_padding) {
        // Inlining math::roundUpPow2() from util/math.h to avoid bringing in the transitive dependencies.
        pad_needed = (pad_needed + 1) & ~1;
      }
@ -117,14 +117,14 @@ inline int64_t ComputeOutputShape(const int64_t in_dim,
  return static_cast<int64_t>(static_cast<float>(in_dim + pad_head + pad_tail - dkernel) / stride + 1);
 }

-template <bool ForceSymmetricAutoPadding>
-Status ComputePadAndOutputShape(const int64_t in_dim,
-                                const int64_t stride, const int64_t kernel, const int64_t dilation,
-                                AutoPadType pad_type,
-                                int64_t& pad_head, int64_t& pad_tail,
-                                int64_t& out_dim) {
+inline Status ComputePadAndOutputShape(const int64_t in_dim,
+                                       const int64_t stride, const int64_t kernel, const int64_t dilation,
+                                       AutoPadType pad_type,
+                                       int64_t& pad_head, int64_t& pad_tail,
+                                       int64_t& out_dim,
+                                       bool force_symmetric_auto_padding = false) {
  ORT_RETURN_IF_ERROR(
-      ComputePad<ForceSymmetricAutoPadding>(in_dim, stride, kernel, dilation, pad_type, pad_head, pad_tail));
+      ComputePad(in_dim, stride, kernel, dilation, pad_type, pad_head, pad_tail, force_symmetric_auto_padding));
  out_dim = ComputeOutputShape(in_dim, stride, kernel, dilation, pad_head, pad_tail);
  return Status::OK();
 }
--- a/onnxruntime/core/providers/cpu/nn/conv.cc
+++ b/onnxruntime/core/providers/cpu/nn/conv.cc
@ -50,7 +50,7 @@ Status Conv<T>::Compute(OpKernelContext* context) const {

  std::vector<int64_t> Y_dims({N, M});
  TensorShape input_shape = X->Shape().Slice(2);
-  ORT_RETURN_IF_ERROR(conv_attrs_.InferOutputShape(input_shape, kernel_shape, strides, dilations, &pads, &Y_dims));
+  ORT_RETURN_IF_ERROR(conv_attrs_.InferOutputShape(input_shape, kernel_shape, strides, dilations, pads, Y_dims));
  Tensor* Y = context->Output(0, Y_dims);
  TensorShape output_shape = Y->Shape().Slice(2);

@ -188,7 +188,7 @@ Status Conv<float>::Compute(OpKernelContext* context) const {

  std::vector<int64_t> Y_dims({N, M});
  TensorShape input_shape = X->Shape().Slice(2);
-  ORT_RETURN_IF_ERROR(conv_attrs_.InferOutputShape(input_shape, kernel_shape, strides, dilations, &pads, &Y_dims));
+  ORT_RETURN_IF_ERROR(conv_attrs_.InferOutputShape(input_shape, kernel_shape, strides, dilations, pads, Y_dims));
  Tensor* Y = context->Output(0, TensorShape(Y_dims));
  TensorShape output_shape = Y->Shape().Slice(2);

--- a/onnxruntime/core/providers/cpu/nn/conv_attributes.h
+++ b/onnxruntime/core/providers/cpu/nn/conv_attributes.h
@ -104,34 +104,34 @@ struct ConvAttributes {
    return Status::OK();
  }

-  template <bool ForceSymmetricAutoPadding = false>
  Status InferOutputShape(const TensorShape& input_shape,
                          const std::vector<int64_t>& kernel_shape,
                          const std::vector<int64_t>& strides_p,
                          const std::vector<int64_t>& dilations_p,
-                          std::vector<int64_t>* pads_p,
-                          std::vector<int64_t>* output_shape) const {
+                          std::vector<int64_t>& pads_p,  // in/out: head pad at [dim], tail pad at [rank + dim]
+                          std::vector<int64_t>& output_shape,  // out: one size is appended per dimension of input_shape
+                          bool force_symmetric_auto_padding = false) const {  // forwarded to ComputePadAndOutputShape
    size_t rank = input_shape.NumDimensions();
    for (size_t dim = 0; dim < rank; ++dim) {
      if (dim >= strides_p.size() || dim >= kernel_shape.size() ||
-          dim >= dilations_p.size() || dim >= pads_p->size() ||
-          rank + dim >= pads_p->size()) {
+          dim >= dilations_p.size() || dim >= pads_p.size() ||  // every per-dimension vector must cover index dim
+          rank + dim >= pads_p.size()) {  // pads_p must also hold the tail entry at rank + dim
        return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Out of bound access to array");
      }
      int64_t dim_size = 0;
-      ORT_RETURN_IF_ERROR(ComputePadAndOutputShape<ForceSymmetricAutoPadding>(
-          input_shape[dim],
-          strides_p[dim],
-          kernel_shape[dim],
-          dilations_p[dim],
-          auto_pad,
-          pads_p->at(dim),
-          pads_p->at(input_shape.NumDimensions() + dim),
-          dim_size));
+      ORT_RETURN_IF_ERROR(ComputePadAndOutputShape(input_shape[dim],
+                                                   strides_p[dim],
+                                                   kernel_shape[dim],
+                                                   dilations_p[dim],
+                                                   auto_pad,
+                                                   pads_p.at(dim),  // bound to pad_head
+                                                   pads_p.at(input_shape.NumDimensions() + dim),  // bound to pad_tail
+                                                   dim_size,
+                                                   force_symmetric_auto_padding));
      if (dim_size <= 0) {
        return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Invalid input shape: " + input_shape.ToString());
      }
-      output_shape->push_back(dim_size);
+      output_shape.push_back(dim_size);  // appended, so entries already placed by the caller are preserved
    }
    return Status::OK();
  }
--- a/onnxruntime/core/providers/cpu/nn/conv_integer.cc
+++ b/onnxruntime/core/providers/cpu/nn/conv_integer.cc
@ -71,7 +71,7 @@ Status ConvInteger::Compute(OpKernelContext* context) const {

  std::vector<int64_t> Y_dims({N, M});
  TensorShape input_shape = X->Shape().Slice(2);
-  ORT_RETURN_IF_ERROR(conv_attrs_.InferOutputShape(input_shape, kernel_shape, strides, dilations, &pads, &Y_dims));
+  ORT_RETURN_IF_ERROR(conv_attrs_.InferOutputShape(input_shape, kernel_shape, strides, dilations, pads, Y_dims));
  Tensor* Y = context->Output(0, TensorShape(Y_dims));
  TensorShape output_shape = Y->Shape().Slice(2);

--- a/onnxruntime/core/providers/cpu/nn/qlinearconv.cc
+++ b/onnxruntime/core/providers/cpu/nn/qlinearconv.cc
@ -97,7 +97,7 @@ Status QLinearConv::Compute(OpKernelContext* context) const {

  std::vector<int64_t> Y_dims({N, M});
  TensorShape input_shape = X->Shape().Slice(2);
-  ORT_RETURN_IF_ERROR(conv_attrs_.InferOutputShape(input_shape, kernel_shape, strides, dilations, &pads, &Y_dims));
+  ORT_RETURN_IF_ERROR(conv_attrs_.InferOutputShape(input_shape, kernel_shape, strides, dilations, pads, Y_dims));
  Tensor* Y = context->Output(0, TensorShape(Y_dims));
  TensorShape output_shape = Y->Shape().Slice(2);

--- a/onnxruntime/core/providers/cuda/nn/conv.cc
+++ b/onnxruntime/core/providers/cuda/nn/conv.cc
@ -89,8 +89,8 @@ Status Conv<T>::ComputeInternal(OpKernelContext* context) const {

      std::vector<int64_t> y_dims;
      y_dims.insert(y_dims.begin(), {N, M});
-      ORT_RETURN_IF_ERROR(conv_attrs_.InferOutputShape<true>(x_shape.Slice(2), kernel_shape,
-                                                             strides, dilations, &pads, &y_dims));
+      ORT_RETURN_IF_ERROR(conv_attrs_.InferOutputShape(x_shape.Slice(2), kernel_shape,
+                                                       strides, dilations, pads, y_dims, true));
      s_.y_dims = y_dims;
      Tensor* Y = context->Output(0, TensorShape(s_.y_dims));
      y_data = reinterpret_cast<CudaT*>(Y->template MutableData<T>());
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc
@ -3,6 +3,7 @@

 #include <core/common/logging/logging.h>
 #include <core/common/safeint.h>
+#include <core/providers/common.h>
 #include <onnx/onnx_pb.h>

 #include "helper.h"
@ -258,6 +259,67 @@ uint32_t AddInitializerTransposed(ModelBuilder& model_builder,
  return operand_idx;
 }

+static vector<int32_t> ComputeConvPads(  // Runs ComputePad for the y and x axes; returns pads as {top, left, bottom, right}.
+    const Shape& input_dimen,  // y/x sizes are read at [2]/[3] when nchw, otherwise at [1]/[2]
+    const uint32_t weight_size_y, const uint32_t weight_size_x,  // passed to ComputePad as the kernel size per axis
+    const std::vector<int32_t>& onnx_pads, const std::vector<int32_t>& onnx_strides, const std::vector<int32_t>& onnx_dilations,
+    AutoPadType auto_pad_type, bool nchw) {
+  const int32_t input_size_y = nchw ? input_dimen[2] : input_dimen[1];
+  const int32_t input_size_x = nchw ? input_dimen[3] : input_dimen[2];
+  const int32_t stride_y = onnx_strides[0];  // strides and dilations are indexed [0] = y, [1] = x
+  const int32_t stride_x = onnx_strides[1];
+  const int32_t dilation_y = onnx_dilations[0];
+  const int32_t dilation_x = onnx_dilations[1];
+  // onnx_pads is read as {top, left, bottom, right}; copies are widened to int64_t before being handed to ComputePad.
+  int64_t padding_top = onnx_pads[0];
+  int64_t padding_bottom = onnx_pads[2];
+  int64_t padding_left = onnx_pads[1];
+  int64_t padding_right = onnx_pads[3];
+  // A failing ComputePad status goes through ORT_THROW_IF_ERROR; this function has no Status return of its own.
+  ORT_THROW_IF_ERROR(ComputePad(input_size_y,
+                                stride_y, weight_size_y, dilation_y,
+                                auto_pad_type,
+                                padding_top, padding_bottom));  // presumably updated in place by ComputePad - TODO confirm
+  ORT_THROW_IF_ERROR(ComputePad(input_size_x,
+                                stride_x, weight_size_x, dilation_x,
+                                auto_pad_type,
+                                padding_left, padding_right));
+  // Narrow back to int32_t and restore the same ordering that onnx_pads was read with.
+  return {static_cast<int32_t>(padding_top), static_cast<int32_t>(padding_left),
+          static_cast<int32_t>(padding_bottom), static_cast<int32_t>(padding_right)};
+}
+
+static void HandleAutoPad(const Shape& input_shape,  // Chooses between an NNAPI padding code (use_auto_pad) and explicit pads.
+                          const uint32_t weight_size_y,
+                          const uint32_t weight_size_x,
+                          const vector<int32_t>& onnx_strides,
+                          const vector<int32_t>& onnx_dilations,
+                          AutoPadType auto_pad_type,
+                          bool use_nchw,
+                          vector<int32_t>& onnx_pads,  // in/out: replaced by the ComputeConvPads result when auto_pad_type is not NOTSET
+                          int32_t& nnapi_padding_code,  // out: written only on the paths that also set use_auto_pad
+                          bool& use_auto_pad) {  // out: set to true when a padding code applies; never reset to false here
+  if (auto_pad_type != AutoPadType::NOTSET) {  // an auto_pad mode was requested: resolve it into concrete pads first
+    onnx_pads = ComputeConvPads(input_shape, weight_size_y, weight_size_x,
+                                onnx_pads, onnx_strides, onnx_dilations,
+                                auto_pad_type, use_nchw);
+    // Only VALID and SAME_UPPER are mapped to a padding code; any other mode keeps the explicit pads computed above.
+    if (AutoPadType::VALID == auto_pad_type || AutoPadType::SAME_UPPER == auto_pad_type) {
+      use_auto_pad = true;
+      nnapi_padding_code = (AutoPadType::VALID == auto_pad_type) ? ANEURALNETWORKS_PADDING_VALID
+                                                                 : ANEURALNETWORKS_PADDING_SAME;
+    }
+  } else {  // NOTSET: onnx_pads is left untouched and only compared against the SAME_UPPER result
+    const auto same_upper_pads = ComputeConvPads(input_shape, weight_size_y, weight_size_x,
+                                                 onnx_pads, onnx_strides, onnx_dilations,
+                                                 AutoPadType::SAME_UPPER, use_nchw);
+    if (onnx_pads == same_upper_pads) {  // the given pads equal the SAME_UPPER result, so the SAME code is used instead
+      use_auto_pad = true;
+      nnapi_padding_code = ANEURALNETWORKS_PADDING_SAME;
+    }
+  }
+}
+
 #pragma endregion helpers

 #pragma region op_base
@ -765,11 +827,6 @@ bool PoolOpBuilder::IsOpSupportedImpl(ModelBuilder& /* model_builder */, const N
      return false;
    }

-    if (helper.Get("auto_pad", "NOTSET") != "NOTSET") {
-      LOGS_DEFAULT(VERBOSE) << "auto_pad is not supported";
-      return false;
-    }
-
    if (helper.Get("kernel_shape", std::vector<int32_t>{1, 1}).size() != 2) {
      LOGS_DEFAULT(VERBOSE) << "Only pooling 2d is supported";
      return false;
@ -841,33 +898,54 @@ void PoolOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Nod
    op_type = ANEURALNETWORKS_MAX_POOL_2D;

  vector<int32_t> onnx_pads, onnx_strides, kernel_shape;
+  bool use_auto_pad = false;
+  int32_t nnapi_padding_code = ANEURALNETWORKS_PADDING_VALID;
+  const auto& input_shape = shaper[input];
  if (op == "AveragePool" || op == "MaxPool") {
+    const auto auto_pad_type = StringToAutoPadType(helper.Get("auto_pad", "NOTSET"));
    kernel_shape = helper.Get("kernel_shape", vector<int32_t>{0, 0});
    onnx_strides = helper.Get("strides", vector<int>{1, 1});
    onnx_pads = helper.Get("pads", vector<int>{0, 0, 0, 0});
+    const auto weight_size_y = static_cast<uint32_t>(kernel_shape[0]);
+    const auto weight_size_x = static_cast<uint32_t>(kernel_shape[1]);
+    HandleAutoPad(input_shape, weight_size_y, weight_size_x,
+                  onnx_strides, {1, 1} /* onnx_dilations */,
+                  auto_pad_type, use_nchw,
+                  onnx_pads, nnapi_padding_code, use_auto_pad);
  } else {  // (op == "GlobalAveragePool" || op == "GlobalMaxPool")
+    use_auto_pad = true;
+    nnapi_padding_code = ANEURALNETWORKS_PADDING_VALID;
    onnx_strides = vector<int32_t>{1, 1};
    onnx_pads = vector<int32_t>{0, 0, 0, 0};
-    if (model_builder.UseNCHW())
-      kernel_shape = vector<int32_t>{static_cast<int32_t>(shaper[input][2]),
-                                     static_cast<int32_t>(shaper[input][3])};
-    else
-      kernel_shape = vector<int32_t>{static_cast<int32_t>(shaper[input][1]),
-                                     static_cast<int32_t>(shaper[input][2])};
+    if (use_nchw) {
+      kernel_shape = vector<int32_t>{static_cast<int32_t>(input_shape[2]),
+                                     static_cast<int32_t>(input_shape[3])};
+    } else {
+      kernel_shape = vector<int32_t>{static_cast<int32_t>(input_shape[1]),
+                                     static_cast<int32_t>(input_shape[2])};
+    }
  }

  int32_t fuse_code = model_builder.FindActivation(node, *node.OutputDefs()[0]);
  std::vector<uint32_t> input_indices;
  input_indices.push_back(operand_indices.at(input));
-  input_indices.push_back(model_builder.AddOperandFromScalar(onnx_pads[1]));
-  input_indices.push_back(model_builder.AddOperandFromScalar(onnx_pads[3]));
-  input_indices.push_back(model_builder.AddOperandFromScalar(onnx_pads[0]));
-  input_indices.push_back(model_builder.AddOperandFromScalar(onnx_pads[2]));
+
+  if (use_auto_pad) {
+    input_indices.push_back(model_builder.AddOperandFromScalar(nnapi_padding_code));
+  } else {
+    input_indices.push_back(model_builder.AddOperandFromScalar(onnx_pads[1]));
+    input_indices.push_back(model_builder.AddOperandFromScalar(onnx_pads[3]));
+    input_indices.push_back(model_builder.AddOperandFromScalar(onnx_pads[0]));
+    input_indices.push_back(model_builder.AddOperandFromScalar(onnx_pads[2]));
+  }
+
  input_indices.push_back(model_builder.AddOperandFromScalar(onnx_strides[1]));
  input_indices.push_back(model_builder.AddOperandFromScalar(onnx_strides[0]));
  input_indices.push_back(model_builder.AddOperandFromScalar(kernel_shape[1]));
  input_indices.push_back(model_builder.AddOperandFromScalar(kernel_shape[0]));
  input_indices.push_back(model_builder.AddOperandFromScalar(fuse_code));
+
+  // TODO support API 28
  input_indices.push_back(model_builder.AddOperandFromScalar(use_nchw));

  shaper.Pool(input,
@ -899,10 +977,6 @@ void ConvOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Nod

 bool ConvOpBuilder::IsOpSupportedImpl(ModelBuilder& model_builder, const Node& node) {
  NodeAttrHelper helper(node);
-  if (helper.Get("auto_pad", "NOTSET") != "NOTSET") {
-    LOGS_DEFAULT(VERBOSE) << "SAME_LOWER auto_pad is not supported";
-    return false;
-  }

  const auto group = helper.Get("group", 1);
  const auto weight_name = node.InputDefs()[1]->Name();
@ -937,7 +1011,7 @@ void ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Nod

  // onnx pads are in the order top, left, bottom, right
  // while nnapi pads is in the order left, right, top, bottom
-  const auto onnx_pads = helper.Get("pads", vector<int>{0, 0, 0, 0});
+  auto onnx_pads = helper.Get("pads", vector<int>{0, 0, 0, 0});

  // onnx dilations is in the order height, width
  // while nnapi dilations are in the order width, height
@ -968,15 +1042,11 @@ void ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Nod
  const auto& weight_tensor = initializers.at(weight);
  bool depthwise_conv2d = (weight_tensor.dims()[1] == 1);

-  std::vector<uint32_t> input_indices;
-  input_indices.push_back(operand_indices.at(input));
-
+  // Pre-process weights
  if (conv2d) {
-    input_indices.push_back(AddInitializerInNewLayout(
-        model_builder, weight, L_0231));
+    AddInitializerInNewLayout(model_builder, weight, L_0231);
  } else {  // depthwise_conv2d
-    input_indices.push_back(AddInitializerInNewLayout(
-        model_builder, weight, L_1230));
+    AddInitializerInNewLayout(model_builder, weight, L_1230);
  }

  bool hasBias = (node.InputDefs().size() >= 3);
@ -1007,23 +1077,50 @@ void ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Nod
    }
  }

+  const auto auto_pad_type = StringToAutoPadType(helper.Get("auto_pad", "NOTSET"));
+  bool use_auto_pad = false;
+  int32_t nnapi_padding_code = ANEURALNETWORKS_PADDING_SAME;
+  const auto& input_shape = shaper[input];
+  const auto& kernel_shape = shaper[weight];
+  const auto weight_size_y = kernel_shape[1];
+  const auto weight_size_x = kernel_shape[2];
+  HandleAutoPad(input_shape, weight_size_y, weight_size_x,
+                onnx_strides, onnx_dilations,
+                auto_pad_type, use_nchw,
+                onnx_pads, nnapi_padding_code, use_auto_pad);
+
+  std::vector<uint32_t> input_indices;
+  input_indices.push_back(operand_indices.at(input));
+  input_indices.push_back(operand_indices.at(weight));
  input_indices.push_back(bias_idx_val);
-  input_indices.push_back(model_builder.AddOperandFromScalar(onnx_pads[1]));
-  input_indices.push_back(model_builder.AddOperandFromScalar(onnx_pads[3]));
-  input_indices.push_back(model_builder.AddOperandFromScalar(onnx_pads[0]));
-  input_indices.push_back(model_builder.AddOperandFromScalar(onnx_pads[2]));
+
+  if (use_auto_pad) {
+    input_indices.push_back(model_builder.AddOperandFromScalar(nnapi_padding_code));
+  } else {
+    input_indices.push_back(model_builder.AddOperandFromScalar(onnx_pads[1]));
+    input_indices.push_back(model_builder.AddOperandFromScalar(onnx_pads[3]));
+    input_indices.push_back(model_builder.AddOperandFromScalar(onnx_pads[0]));
+    input_indices.push_back(model_builder.AddOperandFromScalar(onnx_pads[2]));
+  }
+
  input_indices.push_back(model_builder.AddOperandFromScalar(onnx_strides[1]));
  input_indices.push_back(model_builder.AddOperandFromScalar(onnx_strides[0]));
+
  if (!conv2d && depthwise_conv2d) {
    int32_t depthwiseMultiplier = shaper[weight][3] / group;
    input_indices.push_back(model_builder.AddOperandFromScalar(depthwiseMultiplier));
  }
+
  int32_t fuse_code = model_builder.FindActivation(node, *node.OutputDefs()[0]);
  input_indices.push_back(model_builder.AddOperandFromScalar(fuse_code));
+
  // TODO support API 28
  input_indices.push_back(model_builder.AddOperandFromScalar(use_nchw));
-  input_indices.push_back(model_builder.AddOperandFromScalar(onnx_dilations[1]));
-  input_indices.push_back(model_builder.AddOperandFromScalar(onnx_dilations[0]));
+
+  if (onnx_dilations[1] != 1 || onnx_dilations[0] != 1) {
+    input_indices.push_back(model_builder.AddOperandFromScalar(onnx_dilations[1]));
+    input_indices.push_back(model_builder.AddOperandFromScalar(onnx_dilations[0]));
+  }

  int32_t operationCode;
  if (conv2d) {
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/shaper.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/shaper.cc
@ -4,6 +4,9 @@
 #include "helper.h"
 #include "shaper.h"

+namespace onnxruntime {
+namespace nnapi {
+
 using std::string;
 using std::vector;

@ -45,8 +48,8 @@ void Shaper::Conv(const std::string& input_name,
                  const vector<int32_t>& onnx_dilations,
                  bool nchw,
                  const std::string& output_name) {
-  Shape input_dimen = shape_map_.at(input_name);
-  Shape weight_dimen = shape_map_.at(weight_name);  // num_output, height, width, num_input
+  const Shape& input_dimen = shape_map_.at(input_name);
+  const Shape& weight_dimen = shape_map_.at(weight_name);  // num_output, height, width, num_input

  const auto input_size_y = nchw ? input_dimen[2] : input_dimen[1];
  const auto input_size_x = nchw ? input_dimen[3] : input_dimen[2];
@ -88,8 +91,8 @@ void Shaper::DepthwiseConv(const std::string& input_name,
                           const std::vector<int32_t>& onnx_dilations,
                           bool nchw,
                           const std::string& output_name) {
-  Shape input_dimen = shape_map_.at(input_name);
-  Shape weight_dimen = shape_map_.at(weight_name);  // 1, height, width, num_output
+  const Shape& input_dimen = shape_map_.at(input_name);
+  const Shape& weight_dimen = shape_map_.at(weight_name);  // 1, height, width, num_output

  const auto input_size_y = nchw ? input_dimen[2] : input_dimen[1];
  const auto input_size_x = nchw ? input_dimen[3] : input_dimen[2];
@ -130,11 +133,11 @@ void Shaper::Pool(const std::string& input_name,
                  const std::vector<int32_t>& kernel_shape,
                  bool nchw,
                  const std::string& output_name) {
-  auto input_dimen = shape_map_.at(input_name);
+  const Shape& input_dimen = shape_map_.at(input_name);
  const auto input_size_y = nchw ? input_dimen[2] : input_dimen[1];
  const auto input_size_x = nchw ? input_dimen[3] : input_dimen[2];
-  int32_t weight_size_y = kernel_shape[0];
-  int32_t weight_size_x = kernel_shape[1];
+  const auto weight_size_y = kernel_shape[0];
+  const auto weight_size_x = kernel_shape[1];

  uint32_t output_size_y, output_size_x;
  std::tie(output_size_y, output_size_x) =
@ -167,7 +170,7 @@ void Shaper::Pool(const std::string& input_name,
 void Shaper::Reshape(const std::string& input_name,
                     const std::vector<int32_t>& shape,
                     const std::string& output_name) {
-  auto input_dimen = shape_map_.at(input_name);
+  const Shape& input_dimen = shape_map_.at(input_name);
  int64_t input_size = Product(input_dimen);
  std::vector<uint32_t> output_dimen(shape.size());

@ -209,7 +212,7 @@ void Shaper::Reshape(const std::string& input_name,
 void Shaper::Transpose(const std::string& input_name,
                       const std::vector<int32_t>& perm,
                       const std::string& output_name) {
-  auto input_dimen = shape_map_.at(input_name);
+  const Shape& input_dimen = shape_map_.at(input_name);

  ORT_ENFORCE(perm.size() == input_dimen.size(), "Invalid perm is given!");

@ -231,8 +234,8 @@ void Shaper::Transpose(const std::string& input_name,
 void Shaper::Eltwise(const std::string& input1_name,
                     const std::string& input2_name,
                     const std::string& output_name) {
-  auto& shape1 = shape_map_.at(input1_name);
-  auto& shape2 = shape_map_.at(input2_name);
+  const Shape& shape1 = shape_map_.at(input1_name);
+  const Shape& shape2 = shape_map_.at(input2_name);

  // broadcasting support
  bool shape1IsBigger = shape1.size() >= shape2.size();
@ -283,8 +286,8 @@ void Shaper::Identity(const std::string& input_name,
 void Shaper::FC(const std::string& input1_name, const std::string& input2_name,
                const std::string& output_name) {
  // Currently we only support A*B'+C
-  auto input1_dimen = shape_map_.at(input1_name);
-  Shape input2_dimen = shape_map_.at(input2_name);  // num_units, input_size
+  const Shape& input1_dimen = shape_map_.at(input1_name);
+  const Shape& input2_dimen = shape_map_.at(input2_name);  // num_units, input_size
  Shape output_dimen{input1_dimen[0], input2_dimen[0]};
  shape_map_[output_name] = output_dimen;

@ -301,7 +304,7 @@ void Shaper::Concat(const std::vector<std::string>& input_names,
                    const std::string& output_name) {
  std::vector<Shape> dimens;
  for (const auto& input_name : input_names) {
-    auto& dimen = shape_map_.at(input_name);
+    const Shape& dimen = shape_map_.at(input_name);
    if (!dimens.empty()) {
      for (size_t i = 0; i < dimens[0].size(); i++) {
        if ((int32_t)i == axis)
@ -332,7 +335,7 @@ void Shaper::Concat(const std::vector<std::string>& input_names,
 void Shaper::Squeeze(const std::string& input_name,
                     const std::vector<int32_t>& axes,
                     const std::string& output_name) {
-  std::vector<uint32_t> input_dimen = shape_map_.at(input_name);
+  const Shape& input_dimen = shape_map_.at(input_name);
  int32_t input_size = input_dimen.size();
  size_t axes_size = axes.size();
  std::unordered_set<int32_t> axes_to_be_squeezed;
@ -372,7 +375,7 @@ void Shaper::UpdateShape(const std::string& name, const Shape& new_shape) {
  ORT_ENFORCE(shaper_finalized_,
              "Cannot UpdateShape while shaper is not finalized");

-  const auto& old_shape = shape_map_.at(name);
+  const Shape& old_shape = shape_map_.at(name);
  if (old_shape != new_shape) {
    if (Product(old_shape) != 0)
      ORT_THROW("The shape should be same size or old shape has size 0 (dynamic shape)");
@ -404,3 +407,6 @@ std::string Shape2String(const Shaper::Shape& shape) {
  os << "]";
  return os.str();
 }
+
+}  // namespace nnapi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/shaper.h
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/shaper.h
@ -4,6 +4,9 @@
 #include <unordered_map>
 #include <vector>

+namespace onnxruntime {
+namespace nnapi {
+
 class Shaper {
 public:
  using Shape = std::vector<uint32_t>;
@ -69,3 +72,6 @@ class Shaper {
 };

 std::string Shape2String(const Shaper::Shape& shape);
+
+}  // namespace nnapi
+}  // namespace onnxruntime