From d83f7fd4aa176f56d9093e19a1db13399aac0c8a Mon Sep 17 00:00:00 2001
From: Guoyu Wang <62914304+gwang-msft@users.noreply.github.com>
Date: Thu, 17 Jun 2021 12:36:12 -0700
Subject: [PATCH] [NNAPI EP] Enable Slice support (#8031)

* Enable slice for NNAPI EP

* Add ANEURALNETWORKS_STRIDED_SLICE support

* Addressed CR comments

* Addressed CR comments, rename PrepareForCompute to PrepareForComputeHelper to avoid confusion
---
 .../core/providers/cpu/tensor/slice.cc        | 121 +--------
 onnxruntime/core/providers/cpu/tensor/slice.h |  23 +-
 .../cpu/tensor/slice_compute_metadata.h       |  33 +++
 .../core/providers/cpu/tensor/slice_helper.h  | 141 +++++++++++
 .../nnapi/nnapi_builtin/builders/helper.cc    |   4 +-
 .../nnapi/nnapi_builtin/builders/helper.h     |   4 +-
 .../nnapi_builtin/builders/op_builder.cc      | 229 +++++++++++++++---
 .../builders/op_support_checker.cc            | 170 +++++++++----
 .../providers/cpu/tensor/slice_op.test.cc     |  43 +++-
 9 files changed, 543 insertions(+), 225 deletions(-)
 create mode 100644 onnxruntime/core/providers/cpu/tensor/slice_compute_metadata.h
 create mode 100644 onnxruntime/core/providers/cpu/tensor/slice_helper.h
diff --git a/onnxruntime/core/providers/cpu/tensor/slice.cc b/onnxruntime/core/providers/cpu/tensor/slice.cc
index 0557598732..e8b1fa5d6a 100644
--- a/onnxruntime/core/providers/cpu/tensor/slice.cc
+++ b/onnxruntime/core/providers/cpu/tensor/slice.cc
@@ -8,6 +8,7 @@
 
 #include "core/framework/element_type_lists.h"
 #include "core/providers/common.h"
+#include "core/providers/cpu/tensor/slice_helper.h"
 #include "core/providers/cpu/tensor/utils.h"
 #include "core/providers/op_kernel_type_control.h"
 #include "core/providers/op_kernel_type_control_utils.h"
@@ -44,14 +45,6 @@ const auto data_type_constraints = BuildKernelDefConstraintsFromTypeList<DataTyp
 const auto indices_type_constraints = BuildKernelDefConstraintsFromTypeList<IndicesTypes>();
 const auto enabled_data_type_constraints = BuildKernelDefConstraintsFromTypeList<EnabledDataTypes>();
 const auto enabled_indices_type_constraints = BuildKernelDefConstraintsFromTypeList<EnabledIndicesTypes>();
-
-// std::clamp doesn't exist until C++17 so create a local version
-template <typename T>
-const T& clamp(const T& v, const T& lo, const T& hi) {
-  if (v < lo) return lo;
-  if (v > hi) return hi;
-  return v;
-}
 }  // namespace
 
 ONNX_CPU_OPERATOR_VERSIONED_KERNEL(
@@ -93,6 +86,7 @@ ONNX_CPU_OPERATOR_KERNEL(
 static void FlattenOutputDims(const std::vector<int64_t>& input_dimensions,
                               const std::vector<int64_t>& output_dims,
                               std::vector<int64_t>& starts,
+                              std::vector<int64_t>& ends,
                               std::vector<int64_t>& steps,
                               std::vector<int64_t>*& flattened_output_dims) {
   int num_to_combine = 0;
@@ -120,6 +114,10 @@ static void FlattenOutputDims(const std::vector<int64_t>& input_dimensions,
     // so we can just shrink via resize so the number of entries matches flattened_output_dims
     starts.resize(num_dims);
     steps.resize(num_dims);
+
+    // update ends as well
+    ends.resize(num_dims);
+    ends.back() = dim_value;
   } else {
     flattened_output_dims = nullptr;
   }
@@ -130,47 +128,9 @@ Status SliceBase::PrepareForCompute(const std::vector<int64_t>& raw_starts,
                                     const std::vector<int64_t>& raw_ends,
                                     const std::vector<int64_t>& raw_axes,
                                     SliceOp::PrepareForComputeMetadata& compute_metadata) {
-  // Initialize axes to the provided axes attribute or to the default sequence
-  std::vector<int64_t> axes(raw_axes);
-  if (axes.empty()) {
-    //axes are omitted, they are set to[0, ..., ndim - 1]
-    axes.resize(compute_metadata.starts_.size());
-    std::iota(axes.begin(), axes.end(), 0);
-  }
-
-  // Iterate through the provided axes and override the start/end ranges
-  std::unordered_set<int64_t> unique_axes;
-  const auto& dimension_count = compute_metadata.input_dimensions_.size();
-  for (size_t axis_index = 0, axes_count = axes.size(); axis_index < axes_count; ++axis_index) {
-    auto axis = HandleNegativeAxis(axes[axis_index], dimension_count);  // handle negative and enforce axis is valid
-    if (axis >= static_cast<int64_t>(dimension_count) || axis < 0)
-      return Status(ONNXRUNTIME, INVALID_ARGUMENT, "'axes' has an axis outside of the tensor dimension count");
-    if (unique_axes.find(axis) != unique_axes.end())
-      return Status(ONNXRUNTIME, INVALID_ARGUMENT, "'axes' has duplicates");
-    unique_axes.insert(axis);
-
-    // process start
-    auto start = raw_starts[axis_index];
-    if (start < 0)
-      start += compute_metadata.input_dimensions_[axis];
-    compute_metadata.starts_[axis] = clamp(start, int64_t{0}, compute_metadata.input_dimensions_[axis]);
-
-    // process end
-    auto end = raw_ends[axis_index];
-    if (end < 0)
-      end += compute_metadata.input_dimensions_[axis];
-
-    // find output dim value for this axis
-    auto temp = clamp(end, int64_t{0}, compute_metadata.input_dimensions_[axis]) - compute_metadata.starts_[axis];
-    if (temp < 0)
-      compute_metadata.output_dims_[axis] = 0;
-    else
-      compute_metadata.output_dims_[axis] = temp;
-  }
-
+  ORT_RETURN_IF_ERROR(SliceOp::PrepareForComputeHelper(raw_starts, raw_ends, raw_axes, compute_metadata));
   FlattenOutputDims(compute_metadata.input_dimensions_, compute_metadata.output_dims_, compute_metadata.starts_,
-                    compute_metadata.steps_, compute_metadata.p_flattened_output_dims_);
-
+                    compute_metadata.ends_, compute_metadata.steps_, compute_metadata.p_flattened_output_dims_);
   return Status::OK();
 }
 
@@ -180,70 +140,9 @@ Status SliceBase::PrepareForCompute(const std::vector<int64_t>& raw_starts,
                                     const std::vector<int64_t>& raw_axes,
                                     const std::vector<int64_t>& raw_steps,
                                     SliceOp::PrepareForComputeMetadata& compute_metadata) {
-  // Initialize axes to the provided axes attribute or to the default sequence
-  std::vector<int64_t> axes(raw_axes);
-
-  if (axes.empty()) {
-    // axes are omitted, they are set to[0, ..., ndim - 1]
-    axes.resize(compute_metadata.starts_.size());
-    std::iota(axes.begin(), axes.end(), 0);
-  }
-
-  // Iterate through the provided axes and override the start/end/steps ranges
-  std::unordered_set<int64_t> unique_axes;
-  const auto& dimension_count = compute_metadata.input_dimensions_.size();
-  for (size_t axis_index = 0, axes_count = axes.size(); axis_index < axes_count; ++axis_index) {
-    auto axis = axes[axis_index] < 0 ? axes[axis_index] + static_cast<int64_t>(dimension_count) : axes[axis_index];
-    if (axis >= static_cast<int64_t>(dimension_count) || axis < 0)
-      return Status(ONNXRUNTIME, INVALID_ARGUMENT, "'axes' has an axis outside of the tensor dimension count");
-    if (unique_axes.find(axis) != unique_axes.end())
-      return Status(ONNXRUNTIME, INVALID_ARGUMENT, "'axes' has duplicates");
-    unique_axes.insert(axis);
-
-    // process step
-    auto step = axis_index < raw_steps.size() ? raw_steps[axis_index] : 1;
-    if (step == 0)
-      return Status(ONNXRUNTIME, INVALID_ARGUMENT, "'step' value cannot be 0");
-    compute_metadata.steps_[axis] = step;
-
-    // process start
-    auto start = raw_starts[axis_index];
-    if (start < 0)
-      start += compute_metadata.input_dimensions_[axis];
-    if (step < 0)
-      compute_metadata.starts_[axis] = clamp(start, int64_t{0}, compute_metadata.input_dimensions_[axis] - 1);
-    else
-      compute_metadata.starts_[axis] = clamp(start, int64_t{0}, compute_metadata.input_dimensions_[axis]);
-
-    // process end
-    auto end = raw_ends[axis_index];
-    // INT_MAX has a special meaning for end according to spec
-    // equivalent to 'None' in numpy
-    // it represent slicing to the end of the dimension
-    if (end == std::numeric_limits<int32_t>::max() ||
-        end == std::numeric_limits<int64_t>::max()) {
-      end = step < 0 ? -1 : compute_metadata.input_dimensions_[axis];
-    }
-
-    else {
-      if (end < 0)
-        end += compute_metadata.input_dimensions_[axis];
-      if (step < 0)
-        end = clamp(end, int64_t{-1}, compute_metadata.input_dimensions_[axis]);
-      else
-        end = clamp(end, int64_t{0}, compute_metadata.input_dimensions_[axis]);
-    }
-
-    // find output dim value for this axis
-    auto temp = static_cast<int64_t>(ceil(1.0 * (end - compute_metadata.starts_[axis]) / step));
-    if (temp < 0)
-      compute_metadata.output_dims_[axis] = 0;
-    else
-      compute_metadata.output_dims_[axis] = temp;
-  }
-
+  ORT_RETURN_IF_ERROR(SliceOp::PrepareForComputeHelper(raw_starts, raw_ends, raw_axes, raw_steps, compute_metadata));
   FlattenOutputDims(compute_metadata.input_dimensions_, compute_metadata.output_dims_, compute_metadata.starts_,
-                    compute_metadata.steps_, compute_metadata.p_flattened_output_dims_);
+                    compute_metadata.ends_, compute_metadata.steps_, compute_metadata.p_flattened_output_dims_);
 
   return Status::OK();
 }
diff --git a/onnxruntime/core/providers/cpu/tensor/slice.h b/onnxruntime/core/providers/cpu/tensor/slice.h
index 4e52777fd1..f9549257ca 100644
--- a/onnxruntime/core/providers/cpu/tensor/slice.h
+++ b/onnxruntime/core/providers/cpu/tensor/slice.h
@@ -1,5 +1,6 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
+#pragma once
 
 #ifndef SHARED_PROVIDER
 #include "core/common/common.h"
@@ -7,28 +8,10 @@
 #include "core/util/math_cpuonly.h"
 #endif
 
+#include "core/providers/cpu/tensor/slice_compute_metadata.h"
+
 namespace onnxruntime {
 
-namespace SliceOp {
-struct PrepareForComputeMetadata {
-  PrepareForComputeMetadata() = delete;
-  PrepareForComputeMetadata(const std::vector<int64_t>& input_dimensions)
-      : input_dimensions_(input_dimensions) {
-    size_t dimension_count = input_dimensions.size();
-    starts_.resize(dimension_count, 0);
-    steps_.resize(dimension_count, 1);
-    output_dims_ = input_dimensions;
-  }
-
-  const std::vector<int64_t>& input_dimensions_;
-  std::vector<int64_t> starts_;
-  std::vector<int64_t> steps_;
-  std::vector<int64_t> output_dims_;
-  std::vector<int64_t> flattened_output_dims_;
-  std::vector<int64_t>* p_flattened_output_dims_ = &flattened_output_dims_;
-};
-}  // namespace SliceOp
-
 class SliceBase {
   // static methods that can be used from other ops if needed
  public:
diff --git a/onnxruntime/core/providers/cpu/tensor/slice_compute_metadata.h b/onnxruntime/core/providers/cpu/tensor/slice_compute_metadata.h
new file mode 100644
index 0000000000..0eb37124ce
--- /dev/null
+++ b/onnxruntime/core/providers/cpu/tensor/slice_compute_metadata.h
@@ -0,0 +1,33 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+// This file contains the definition of the PrepareForComputeMetadata for Slice operator
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+namespace onnxruntime {
+
+namespace SliceOp {
+struct PrepareForComputeMetadata {  // per-call Slice state: normalized starts/ends/steps and the output shape
+  explicit PrepareForComputeMetadata(const std::vector<int64_t>& input_dimensions)
+      : input_dimensions_(input_dimensions),
+        ends_(input_dimensions),          // default: slice to the end of every axis
+        output_dims_(input_dimensions) {  // default: output shape equals the input shape
+    size_t dimension_count = input_dimensions.size();
+    starts_.resize(dimension_count, 0);  // default: start at index 0 of every axis
+    steps_.resize(dimension_count, 1);   // default: step of 1 on every axis
+  }
+  // NOTE: input_dimensions_ is a reference, the vector given to the constructor must outlive this object
+  const std::vector<int64_t>& input_dimensions_;
+  std::vector<int64_t> starts_;
+  std::vector<int64_t> ends_;  // can be -1 after SliceOp::PrepareForComputeHelper when the step is negative
+  std::vector<int64_t> steps_;
+  std::vector<int64_t> output_dims_;
+  std::vector<int64_t> flattened_output_dims_;
+  std::vector<int64_t>* p_flattened_output_dims_ = &flattened_output_dims_;  // FlattenOutputDims may reset it to nullptr
+};
+
+}  // namespace SliceOp
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cpu/tensor/slice_helper.h b/onnxruntime/core/providers/cpu/tensor/slice_helper.h
new file mode 100644
index 0000000000..410c9d2c37
--- /dev/null
+++ b/onnxruntime/core/providers/cpu/tensor/slice_helper.h
@@ -0,0 +1,141 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+// This file contains the functions that compute the starts, ends, steps (strides) and output shape
+// for the Slice op, which can be called from other ops or EPs.
+#pragma once
+#include "core/providers/cpu/tensor/slice_compute_metadata.h"
+
+namespace onnxruntime {
+
+// std::clamp doesn't exist until C++17 so create a local version
+// Yields a reference to lo when v < lo, to hi when v > hi, and to v itself otherwise
+template <typename T>
+const T& clamp(const T& v, const T& lo, const T& hi) {
+  if (v < lo) return lo;
+  return (v > hi) ? hi : v;
+}
+
+namespace SliceOp {
+// Computes the per-axis starts/ends and the output shape for Slice without steps (Slice V1-9 & DynamicSlice).
+// raw_starts/raw_ends hold one entry per entry of raw_axes (or per input dimension when raw_axes is empty).
+// Returns INVALID_ARGUMENT for duplicate/out-of-range axes or too few starts/ends; does not flatten the output.
+inline Status PrepareForComputeHelper(const std::vector<int64_t>& raw_starts,
+                                      const std::vector<int64_t>& raw_ends,
+                                      const std::vector<int64_t>& raw_axes,
+                                      SliceOp::PrepareForComputeMetadata& compute_metadata) {
+  // Initialize axes to the provided axes attribute or to the default sequence
+  std::vector<int64_t> axes(raw_axes);
+  if (axes.empty()) {
+    // axes are omitted, they are set to [0, ..., ndim - 1]
+    axes.resize(compute_metadata.starts_.size());
+    std::iota(axes.begin(), axes.end(), 0);
+  }
+
+  // Every axis needs its own start and end, otherwise the indexing below would read out of bounds
+  if (raw_starts.size() < axes.size() || raw_ends.size() < axes.size())
+    return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "'starts' and 'ends' must have an entry for each axis");
+
+  // Iterate through the provided axes and override the start/end ranges
+  std::unordered_set<int64_t> unique_axes;
+  const auto& dimension_count = compute_metadata.input_dimensions_.size();
+  for (size_t axis_index = 0, axes_count = axes.size(); axis_index < axes_count; ++axis_index) {
+    auto axis = HandleNegativeAxis(axes[axis_index], dimension_count);  // handle negative and enforce axis is valid
+    if (axis >= static_cast<int64_t>(dimension_count) || axis < 0)
+      return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "'axes' has an axis outside of the tensor dimension count");
+    if (!unique_axes.insert(axis).second)
+      return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "'axes' has duplicates");
+    const int64_t dim = compute_metadata.input_dimensions_[axis];
+
+    // process start: negative values count from the end of the axis, then clamp into [0, dim]
+    auto start = raw_starts[axis_index];
+    if (start < 0) start += dim;
+    compute_metadata.starts_[axis] = clamp(start, int64_t{0}, dim);
+
+    // process end: same normalization as start
+    auto end = raw_ends[axis_index];
+    if (end < 0) end += dim;
+    compute_metadata.ends_[axis] = clamp(end, int64_t{0}, dim);
+
+    // find output dim value for this axis, an empty range (end before start) yields 0
+    const auto extent = compute_metadata.ends_[axis] - compute_metadata.starts_[axis];
+    compute_metadata.output_dims_[axis] = extent < 0 ? 0 : extent;
+  }
+
+  return Status::OK();
+}
+
+// Computes the per-axis starts/ends/steps and the output shape for Slice with steps (Slice V10).
+// raw_starts/raw_ends hold one entry per entry of raw_axes (or per input dimension when raw_axes is empty),
+// a missing raw_steps entry defaults to 1.
+// Returns INVALID_ARGUMENT for duplicate/out-of-range axes, a zero step or too few starts/ends.
+// Please note this will not Flatten the output shape
+inline Status PrepareForComputeHelper(const std::vector<int64_t>& raw_starts,
+                                      const std::vector<int64_t>& raw_ends,
+                                      const std::vector<int64_t>& raw_axes,
+                                      const std::vector<int64_t>& raw_steps,
+                                      SliceOp::PrepareForComputeMetadata& compute_metadata) {
+  // Initialize axes to the provided axes attribute or to the default sequence
+  std::vector<int64_t> axes(raw_axes);
+
+  if (axes.empty()) {
+    // axes are omitted, they are set to [0, ..., ndim - 1]
+    axes.resize(compute_metadata.starts_.size());
+    std::iota(axes.begin(), axes.end(), 0);
+  }
+
+  // Every axis needs its own start and end, otherwise the indexing below would read out of bounds
+  if (raw_starts.size() < axes.size() || raw_ends.size() < axes.size())
+    return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "'starts' and 'ends' must have an entry for each axis");
+
+  // Iterate through the provided axes and override the start/end/steps ranges
+  std::unordered_set<int64_t> unique_axes;
+  const auto& dimension_count = compute_metadata.input_dimensions_.size();
+  for (size_t axis_index = 0, axes_count = axes.size(); axis_index < axes_count; ++axis_index) {
+    auto axis = axes[axis_index] < 0 ? axes[axis_index] + static_cast<int64_t>(dimension_count) : axes[axis_index];
+    if (axis >= static_cast<int64_t>(dimension_count) || axis < 0)
+      return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "'axes' has an axis outside of the tensor dimension count");
+    if (!unique_axes.insert(axis).second)
+      return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "'axes' has duplicates");
+    const int64_t dim = compute_metadata.input_dimensions_[axis];
+
+    // process step
+    auto step = axis_index < raw_steps.size() ? raw_steps[axis_index] : 1;
+    if (step == 0)
+      return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "'step' value cannot be 0");
+    compute_metadata.steps_[axis] = step;
+
+    // process start: negative values count from the end of the axis; when slicing backwards
+    // the start is clamped to dim - 1 instead of dim
+    auto start = raw_starts[axis_index];
+    if (start < 0)
+      start += dim;
+    compute_metadata.starts_[axis] = clamp(start, int64_t{0}, step < 0 ? dim - 1 : dim);
+
+    // process end
+    auto end = raw_ends[axis_index];
+    // INT_MAX has a special meaning for end according to spec
+    // equivalent to 'None' in numpy
+    // it represents slicing to the end of the dimension
+    if (end == std::numeric_limits<int32_t>::max() ||
+        end == std::numeric_limits<int64_t>::max()) {
+      end = step < 0 ? -1 : dim;
+    } else {
+      if (end < 0)
+        end += dim;
+      // when slicing backwards the lower bound is -1 rather than 0, so that index 0 can still be included
+      end = clamp(end, step < 0 ? int64_t{-1} : int64_t{0}, dim);
+    }
+
+    compute_metadata.ends_[axis] = end;
+
+    // find output dim value for this axis: ceil((end - start) / step), never negative
+    auto temp = static_cast<int64_t>(ceil(1.0 * (compute_metadata.ends_[axis] - compute_metadata.starts_[axis]) / step));
+    compute_metadata.output_dims_[axis] = temp < 0 ? 0 : temp;
+  }
+
+  return Status::OK();
+}
+
+}  // namespace SliceOp
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc
index 868d1f5d73..4646d1cb08 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc
@@ -539,8 +539,8 @@ std::string Shape2String(const std::vector<uint32_t>& shape) {
   return os.str();
 }
 
-bool CheckIsInitializerTensor(const InitializedTensorSet& initializers, const Node& node,
-                              size_t input_idx, const char* input_name) {
+bool CheckIsInitializer(const InitializedTensorSet& initializers, const Node& node,
+                        size_t input_idx, const char* input_name) {
   if (!Contains(initializers, node.InputDefs()[input_idx]->Name())) {
     LOGS_DEFAULT(VERBOSE) << input_name << " of " << node.OpType() << " must be an initializer tensor";
     return false;
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h
index 277025ebf0..71ff61d3e7 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h
@@ -133,8 +133,8 @@ std::vector<std::vector<size_t>> GetSupportedNodes(const GraphViewer& graph_view
 std::string Shape2String(const std::vector<uint32_t>& shape);
 
 // Check the given input is an initializer tensor
-bool CheckIsInitializerTensor(const InitializedTensorSet& initializers, const Node& node,
-                              size_t index, const char* input_name) ORT_MUST_USE_RESULT;
+bool CheckIsInitializer(const InitializedTensorSet& initializers, const Node& node,
+                        size_t index, const char* input_name) ORT_MUST_USE_RESULT;
 
 }  // namespace nnapi
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc
index 6368ee2a5b..9e640f56b3 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc
@@ -8,6 +8,7 @@
 #include <onnx/onnx_pb.h>
 
 #include "core/providers/shared/utils/utils.h"
+#include "core/providers/cpu/tensor/slice_helper.h"
 #include "helper.h"
 #include "model_builder.h"
 #include "op_builder.h"
@@ -2536,7 +2537,6 @@ Status MinMaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
 #pragma region op_elu
 
 class EluOpBuilder : public BaseOpBuilder {
- public:
  private:
   Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node) const override ORT_MUST_USE_RESULT;
 };
@@ -2561,6 +2561,179 @@ Status EluOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const No
 
 #pragma endregion
 
+#pragma region op_slice
+// Builder for the ONNX Slice op; the implementation emits ANEURALNETWORKS_SLICE or ANEURALNETWORKS_STRIDED_SLICE
+class SliceOpBuilder : public BaseOpBuilder {
+ public:
+  void AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const override;
+
+ private:
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node) const override ORT_MUST_USE_RESULT;
+};
+
+// Registers every input of Slice except input 0 (the data) with ModelBuilder::AddInitializerToSkip.
+// Input order is data, starts, ends, axes (optional), steps (optional).
+void SliceOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const {
+  const auto input_defs = node.InputDefs();
+  // Slice takes at most 5 inputs; only walk the ones that are actually present so a node with
+  // fewer inputs than expected can never index past the end of input_defs
+  constexpr size_t kMaxSliceInputs = 5;
+  const size_t input_count = input_defs.size() < kMaxSliceInputs ? input_defs.size() : kMaxSliceInputs;
+  for (size_t i = 1; i < input_count; ++i) {
+    model_builder.AddInitializerToSkip(input_defs[i]->Name());
+  }
+}
+// Adds the NNAPI operation for an ONNX Slice node (ANEURALNETWORKS_SLICE or ANEURALNETWORKS_STRIDED_SLICE)
+Status SliceOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node) const {
+  auto& shaper(model_builder.GetShaper());
+  const auto& operand_indices(model_builder.GetOperandIndices());
+  const auto& operand_types(model_builder.GetOperandTypes());
+  const auto input_defs = node.InputDefs();
+  const auto& input_shape = shaper[input_defs[0]->Name()];
+  std::vector<int64_t> input_shape_64(input_shape.cbegin(), input_shape.cend());
+  SliceOp::PrepareForComputeMetadata compute_metadata(input_shape_64);
+
+  {
+    // We need to copy the data from the starts/ends/axes/steps initializers to int64 vectors
+    // to be used in the shared SliceOp::PrepareForComputeHelper function to calculate the output shape
+    // and normalize the inputs: the ONNX inputs may cover only some of the axes, while
+    // PrepareForComputeHelper produces starts/ends/steps entries for every axis of the input
+    std::vector<int64_t> input_starts;
+    std::vector<int64_t> input_ends;
+    std::vector<int64_t> input_axes;
+    std::vector<int64_t> input_steps;
+
+    const auto CopyInputData = [&node, &model_builder](size_t input_idx, std::vector<int64_t>& data) {
+      data.clear();
+      const auto input_defs = node.InputDefs();
+
+      // This is an optional input that is not present, leave data empty
+      if (input_defs.size() <= input_idx)
+        return Status::OK();
+
+      const auto& input_name = input_defs[input_idx]->Name();
+      const auto& initializers(model_builder.GetInitializerTensors());
+
+      const auto& tensor = *initializers.at(input_name);
+      std::unique_ptr<uint8_t[]> unpacked_tensor;
+      size_t tensor_byte_size;
+      ORT_RETURN_IF_ERROR(
+          onnxruntime::utils::UnpackInitializerData(tensor, model_builder.GetGraphViewer().ModelPath(),
+                                                    unpacked_tensor, tensor_byte_size));
+      const auto data_type = tensor.data_type();
+      if (data_type == ONNX_NAMESPACE::TensorProto_DataType_INT64) {
+        const int64_t* tensor_data = reinterpret_cast<const int64_t*>(unpacked_tensor.get());
+        size_t size = tensor_byte_size / sizeof(int64_t);
+        data.insert(data.end(), tensor_data, tensor_data + size);
+      } else if (data_type == ONNX_NAMESPACE::TensorProto_DataType_INT32) {
+        const int32_t* tensor_data = reinterpret_cast<const int32_t*>(unpacked_tensor.get());
+        size_t size = tensor_byte_size / sizeof(int32_t);
+        data.insert(data.end(), tensor_data, tensor_data + size);
+      } else {
+        return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
+                               "Data type for starts and ends inputs' is not supported in this build. Got ",
+                               data_type);
+      }
+
+      return Status::OK();
+    };
+
+    ORT_RETURN_IF_ERROR(CopyInputData(1, input_starts));
+    ORT_RETURN_IF_ERROR(CopyInputData(2, input_ends));
+    ORT_RETURN_IF_ERROR(CopyInputData(3, input_axes));
+    ORT_RETURN_IF_ERROR(CopyInputData(4, input_steps));
+    ORT_RETURN_IF_ERROR(
+        SliceOp::PrepareForComputeHelper(input_starts, input_ends, input_axes, input_steps, compute_metadata));
+  }
+
+  // The NNAPI output shape is of type uint32_t, convert it from the int64 compute_metadata.output_dims_
+  Shape nnapi_output_shape;
+  nnapi_output_shape.reserve(compute_metadata.output_dims_.size());
+  std::transform(compute_metadata.output_dims_.cbegin(), compute_metadata.output_dims_.cend(),
+                 std::back_inserter(nnapi_output_shape),
+                 [](int64_t i) { return SafeInt<uint32_t>(i); });
+
+  const auto& input = node.InputDefs()[0]->Name();
+  const auto& output = node.OutputDefs()[0]->Name();
+  bool output_is_nhwc = model_builder.IsOperandNHWC(input);
+
+  // There is no shape inference for Slice in the shaper, everything is calculated here,
+  // so we only need to add the output shape to the shaper
+  shaper.AddShape(output, nnapi_output_shape);
+  const OperandType output_operand_type(operand_types.at(input).type, shaper[output]);
+
+  std::vector<uint32_t> input_indices;
+  input_indices.push_back(operand_indices.at(input));
+
+  // begin/end/strides of ANEURALNETWORKS_STRIDED_SLICE all have the same shape, {rank of the input}
+  Shape param_dimen = {static_cast<uint32_t>(input_shape.size())};
+
+  // helper function to add an int32 tensor operand (begin/end/strides/sizes) and append it to input_indices
+  const auto AddOperand = [&model_builder, &node, &input_indices, &operand_indices](
+                              const char* name, const Shape& shape, const std::vector<int64_t>& param_raw_data) {
+    std::vector<int32_t> param_data;
+    param_data.reserve(param_raw_data.size());
+    std::transform(param_raw_data.cbegin(), param_raw_data.cend(),
+                   std::back_inserter(param_data),
+                   [](int64_t i) { return SafeInt<int32_t>(i); });
+    std::string param_name = model_builder.GetUniqueName(node.Name() + name);
+    OperandType param_operand_type(Type::TENSOR_INT32, shape);
+    ORT_RETURN_IF_ERROR(
+        model_builder.AddOperandFromPersistMemoryBuffer(param_name, param_data.data(), param_operand_type));
+    input_indices.push_back(operand_indices.at(param_name));
+    return Status::OK();
+  };
+
+  ORT_RETURN_IF_ERROR(AddOperand("starts", param_dimen, compute_metadata.starts_));  // nnapi_begin
+
+  // NNAPI has 2 slice operations
+  // - ANEURALNETWORKS_SLICE
+  //    Simpler and faster version of slice without steps, available from ANEURALNETWORKS_FEATURE_LEVEL_3
+  //    Use this one if no step other than 1 is used in ONNX slice
+  // - ANEURALNETWORKS_STRIDED_SLICE
+  //    More comprehensive version, available from ANEURALNETWORKS_FEATURE_LEVEL_2
+  int op_code = ANEURALNETWORKS_STRIDED_SLICE;
+  if (std::all_of(compute_metadata.steps_.cbegin(),
+                  compute_metadata.steps_.cend(),
+                  [](int64_t i) { return i == 1; }) &&
+      model_builder.GetNNAPIFeatureLevel() > ANEURALNETWORKS_FEATURE_LEVEL_2) {
+    op_code = ANEURALNETWORKS_SLICE;
+    // the nnapi size of the slice in this case is the output shape
+    ORT_RETURN_IF_ERROR(AddOperand("sizes", param_dimen, compute_metadata.output_dims_));  // nnapi_sizes
+  } else {
+    // ** The special treatment of ends **
+    // The nnapi_end needs some special handling, based on the current undocumented behavior of
+    // ANEURALNETWORKS_STRIDED_SLICE
+    // For ORT, after SliceOp::PrepareForComputeHelper, the end of an axis will be -1 when the step
+    // is negative and the last element of the slice is at the beginning of the axis
+    // (we are slicing backwards all the way to index 0)
+    // For NNAPI, it is not documented that end can be negative,
+    // see https://developer.android.com/ndk/reference/group/neural-networks#group___neural_networks_1ggaabbe492c60331b13038e39d4207940e0a89695302f8b1e7ae7ce8f4d8c0b8a752
+    // However, the actual NNAPI StridedSlice implementation treats negative ends specially,
+    // See https://android.googlesource.com/platform/frameworks/ml/+/5b525d4d9100819d87447bd2c2a0bcfdd62899ee/nn/common/operations/StridedSlice.cpp#177
+    // and, https://android.googlesource.com/platform/frameworks/ml/+/5b525d4d9100819d87447bd2c2a0bcfdd62899ee/nn/common/include/OperationsUtils.h#262
+    // If a negative end is no less than -dim (dimension of the axis), it will be treated as an index counting from
+    // the end, for example, dim = 5, and end = -1, the end will be normalized to 4, which will cause
+    // an incorrect result, so here we have to make the end = -dim - 1 such that it will not be treated as
+    // an index counting from the end.
+    std::vector<int64_t> ends = compute_metadata.ends_;
+    for (size_t i = 0; i < ends.size(); ++i) {
+      if (ends[i] == -1) {
+        ends[i] = -static_cast<int32_t>(input_shape[i] + 1);
+      }
+    }
+    ORT_RETURN_IF_ERROR(AddOperand("ends", param_dimen, ends));                      // nnapi_end
+    ORT_RETURN_IF_ERROR(AddOperand("steps", param_dimen, compute_metadata.steps_));  // nnapi_strides
+    // We do not use the following inputs in ANEURALNETWORKS_STRIDED_SLICE, set them all to 0
+    ADD_SCALAR_OPERAND(model_builder, input_indices, 0);  // begin_mask
+    ADD_SCALAR_OPERAND(model_builder, input_indices, 0);  // end_mask
+    ADD_SCALAR_OPERAND(model_builder, input_indices, 0);  // shrink_axis_mask
+  }
+  return model_builder.AddOperation(op_code, input_indices, {output}, {output_operand_type}, {output_is_nhwc});
+}
+
+#pragma endregion
+
 #pragma region CreateGetOpBuilders
 
 // The reason we use macros to create OpBuilders is for easy exclusion in build if certain op(s) are not used
@@ -2579,24 +2752,39 @@ Status EluOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const No
 static OpBuilderRegistrations CreateOpBuilderRegistrations() {
   OpBuilderRegistrations op_registrations;
 
+  // Builders handle a single op
+  NNAPI_EP_ADD_SINGLE_OP_BUILDER("BatchNormalization", BatchNormalizationOpBuilder);
+  NNAPI_EP_ADD_SINGLE_OP_BUILDER("Cast", CastOpBuilder);
+  NNAPI_EP_ADD_SINGLE_OP_BUILDER("Clip", ClipOpBuilder);
+  NNAPI_EP_ADD_SINGLE_OP_BUILDER("Concat", ConcatOpBuilder);
+  NNAPI_EP_ADD_SINGLE_OP_BUILDER("DequantizeLinear", DequantizeLinearOpBuilder);
+  NNAPI_EP_ADD_SINGLE_OP_BUILDER("Elu", EluOpBuilder);
+  NNAPI_EP_ADD_SINGLE_OP_BUILDER("Flatten", FlattenOpBuilder);
+  NNAPI_EP_ADD_SINGLE_OP_BUILDER("Identity", IdentityOpBuilder);
+  NNAPI_EP_ADD_SINGLE_OP_BUILDER("LRN", LRNOpBuilder);
+  NNAPI_EP_ADD_SINGLE_OP_BUILDER("QuantizeLinear", QuantizeLinearOpBuilder);
+  NNAPI_EP_ADD_SINGLE_OP_BUILDER("Relu", ReluOpBuilder);
+  NNAPI_EP_ADD_SINGLE_OP_BUILDER("Reshape", ReshapeOpBuilder);
+  NNAPI_EP_ADD_SINGLE_OP_BUILDER("Resize", ResizeOpBuilder);
+  NNAPI_EP_ADD_SINGLE_OP_BUILDER("Slice", SliceOpBuilder);
+  NNAPI_EP_ADD_SINGLE_OP_BUILDER("Softmax", SoftMaxOpBuilder);
+  NNAPI_EP_ADD_SINGLE_OP_BUILDER("Squeeze", SqueezeOpBuilder);
+  NNAPI_EP_ADD_SINGLE_OP_BUILDER("Transpose", TransposeOpBuilder);
+
+  // Builders shared among similar ops
   {
     NNAPI_EP_ADD_SHARED_OP_BUILDER("Add", BinaryOpBuilder);
-    NNAPI_EP_ADD_SHARED_OP_BUILDER("Sub", BinaryOpBuilder);
-    NNAPI_EP_ADD_SHARED_OP_BUILDER("Mul", BinaryOpBuilder);
     NNAPI_EP_ADD_SHARED_OP_BUILDER("Div", BinaryOpBuilder);
-    NNAPI_EP_ADD_SHARED_OP_BUILDER("QLinearAdd", BinaryOpBuilder);
+    NNAPI_EP_ADD_SHARED_OP_BUILDER("Mul", BinaryOpBuilder);
     NNAPI_EP_ADD_SHARED_OP_BUILDER("Pow", BinaryOpBuilder);
+    NNAPI_EP_ADD_SHARED_OP_BUILDER("QLinearAdd", BinaryOpBuilder);
+    NNAPI_EP_ADD_SHARED_OP_BUILDER("Sub", BinaryOpBuilder);
   }
 
-  NNAPI_EP_ADD_SINGLE_OP_BUILDER("Relu", ReluOpBuilder);
-  NNAPI_EP_ADD_SINGLE_OP_BUILDER("Transpose", TransposeOpBuilder);
-  NNAPI_EP_ADD_SINGLE_OP_BUILDER("Reshape", ReshapeOpBuilder);
-  NNAPI_EP_ADD_SINGLE_OP_BUILDER("BatchNormalization", BatchNormalizationOpBuilder);
-
   {
+    NNAPI_EP_ADD_SHARED_OP_BUILDER("AveragePool", PoolOpBuilder);
     NNAPI_EP_ADD_SHARED_OP_BUILDER("GlobalAveragePool", PoolOpBuilder);
     NNAPI_EP_ADD_SHARED_OP_BUILDER("GlobalMaxPool", PoolOpBuilder);
-    NNAPI_EP_ADD_SHARED_OP_BUILDER("AveragePool", PoolOpBuilder);
     NNAPI_EP_ADD_SHARED_OP_BUILDER("MaxPool", PoolOpBuilder);
     NNAPI_EP_ADD_SHARED_OP_BUILDER("QLinearAveragePool", PoolOpBuilder);
   }
@@ -2606,10 +2794,6 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() {
     NNAPI_EP_ADD_SHARED_OP_BUILDER("QLinearConv", ConvOpBuilder);
   }
 
-  NNAPI_EP_ADD_SINGLE_OP_BUILDER("Cast", CastOpBuilder);
-  NNAPI_EP_ADD_SINGLE_OP_BUILDER("Softmax", SoftMaxOpBuilder);
-  NNAPI_EP_ADD_SINGLE_OP_BUILDER("Identity", IdentityOpBuilder);
-
   {
     NNAPI_EP_ADD_SHARED_OP_BUILDER("Gemm", GemmOpBuilder);
     NNAPI_EP_ADD_SHARED_OP_BUILDER("MatMul", GemmOpBuilder);
@@ -2621,30 +2805,19 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() {
     NNAPI_EP_ADD_SHARED_OP_BUILDER("Exp", UnaryOpBuilder);
     NNAPI_EP_ADD_SHARED_OP_BUILDER("Floor", UnaryOpBuilder);
     NNAPI_EP_ADD_SHARED_OP_BUILDER("Log", UnaryOpBuilder);
-    NNAPI_EP_ADD_SHARED_OP_BUILDER("Sigmoid", UnaryOpBuilder);
     NNAPI_EP_ADD_SHARED_OP_BUILDER("Neg", UnaryOpBuilder);
+    NNAPI_EP_ADD_SHARED_OP_BUILDER("QLinearSigmoid", UnaryOpBuilder);
+    NNAPI_EP_ADD_SHARED_OP_BUILDER("Sigmoid", UnaryOpBuilder);
     NNAPI_EP_ADD_SHARED_OP_BUILDER("Sin", UnaryOpBuilder);
     NNAPI_EP_ADD_SHARED_OP_BUILDER("Sqrt", UnaryOpBuilder);
     NNAPI_EP_ADD_SHARED_OP_BUILDER("Tanh", UnaryOpBuilder);
-    NNAPI_EP_ADD_SHARED_OP_BUILDER("QLinearSigmoid", UnaryOpBuilder);
   }
 
-  NNAPI_EP_ADD_SINGLE_OP_BUILDER("Concat", ConcatOpBuilder);
-  NNAPI_EP_ADD_SINGLE_OP_BUILDER("Squeeze", SqueezeOpBuilder);
-  NNAPI_EP_ADD_SINGLE_OP_BUILDER("QuantizeLinear", QuantizeLinearOpBuilder);
-  NNAPI_EP_ADD_SINGLE_OP_BUILDER("DequantizeLinear", DequantizeLinearOpBuilder);
-  NNAPI_EP_ADD_SINGLE_OP_BUILDER("LRN", LRNOpBuilder);
-  NNAPI_EP_ADD_SINGLE_OP_BUILDER("Clip", ClipOpBuilder);
-  NNAPI_EP_ADD_SINGLE_OP_BUILDER("Resize", ResizeOpBuilder);
-  NNAPI_EP_ADD_SINGLE_OP_BUILDER("Flatten", FlattenOpBuilder);
-
   {
-    NNAPI_EP_ADD_SHARED_OP_BUILDER("Min", MinMaxOpBuilder);
     NNAPI_EP_ADD_SHARED_OP_BUILDER("Max", MinMaxOpBuilder);
+    NNAPI_EP_ADD_SHARED_OP_BUILDER("Min", MinMaxOpBuilder);
   }
 
-  NNAPI_EP_ADD_SINGLE_OP_BUILDER("Elu", EluOpBuilder);
-
   return op_registrations;
 }
 
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc
index eecb5ad0f8..dd8f7b68e3 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc
@@ -76,7 +76,8 @@ class BaseOpSupportChecker : public IOpSupportChecker {
     return true;
   }
 
-  virtual int32_t GetMinSupportedNNAPIFeatureLevel(const Node& /* node */, const OpSupportCheckParams& /* params */) const {
+  virtual int32_t GetMinSupportedNNAPIFeatureLevel(const Node& /* node */,
+                                                   const OpSupportCheckParams& /* params */) const {
     // ANEURALNETWORKS_FEATURE_LEVEL_1 is the baseline version of NNAPI,
     // There is no NNAPI support for Android API level 26-
     return ANEURALNETWORKS_FEATURE_LEVEL_1;
@@ -319,7 +320,8 @@ class TransposeOpSupportChecker : public BaseOpSupportChecker {
   bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node,
                          const OpSupportCheckParams& params) const override;
 
-  int32_t GetMinSupportedNNAPIFeatureLevel(const Node& /* node */, const OpSupportCheckParams& /* params */) const override {
+  int32_t GetMinSupportedNNAPIFeatureLevel(const Node& /* node */,
+                                           const OpSupportCheckParams& /* params */) const override {
     return ANEURALNETWORKS_FEATURE_LEVEL_2;
   }
 
@@ -490,7 +492,8 @@ class PoolOpSupportChecker : public BaseOpSupportChecker {
   bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node,
                          const OpSupportCheckParams& params) const override;
 
-  int32_t GetMinSupportedNNAPIFeatureLevel(const Node& /* node */, const OpSupportCheckParams& params) const override {
+  int32_t GetMinSupportedNNAPIFeatureLevel(const Node& /* node */,
+                                           const OpSupportCheckParams& params) const override {
     return params.use_nchw ? ANEURALNETWORKS_FEATURE_LEVEL_3 : ANEURALNETWORKS_FEATURE_LEVEL_2;
   }
 
@@ -667,7 +670,8 @@ class ConvOpSupportChecker : public BaseOpSupportChecker {
   bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node,
                          const OpSupportCheckParams& params) const override;
 
-  int32_t GetMinSupportedNNAPIFeatureLevel(const Node& /* node */, const OpSupportCheckParams& params) const override {
+  int32_t GetMinSupportedNNAPIFeatureLevel(const Node& /* node */,
+                                           const OpSupportCheckParams& params) const override {
     return params.use_nchw ? ANEURALNETWORKS_FEATURE_LEVEL_3 : ANEURALNETWORKS_FEATURE_LEVEL_2;
   }
 
@@ -775,7 +779,8 @@ class CastOpSupportChecker : public BaseOpSupportChecker {
   bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node,
                          const OpSupportCheckParams& params) const override;
 
-  int32_t GetMinSupportedNNAPIFeatureLevel(const Node& /* node */, const OpSupportCheckParams& /* params */) const override {
+  int32_t GetMinSupportedNNAPIFeatureLevel(const Node& /* node */,
+                                           const OpSupportCheckParams& /* params */) const override {
     return ANEURALNETWORKS_FEATURE_LEVEL_3;
   }
 
@@ -805,7 +810,8 @@ class SoftMaxOpSupportChecker : public BaseOpSupportChecker {
   bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node,
                          const OpSupportCheckParams& params) const override;
 
-  int32_t GetMinSupportedNNAPIFeatureLevel(const Node& /* node */, const OpSupportCheckParams& /* params */) const override {
+  int32_t GetMinSupportedNNAPIFeatureLevel(const Node& /* node */,
+                                           const OpSupportCheckParams& /* params */) const override {
     return ANEURALNETWORKS_FEATURE_LEVEL_2;
   }
 };
@@ -1043,7 +1049,8 @@ class UnaryOpSupportChecker : public BaseOpSupportChecker {
   bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node,
                          const OpSupportCheckParams& params) const override;
 
-  int32_t GetMinSupportedNNAPIFeatureLevel(const Node& node, const OpSupportCheckParams& params) const override;
+  int32_t GetMinSupportedNNAPIFeatureLevel(const Node& node,
+                                           const OpSupportCheckParams& params) const override;
 
   bool HasSupportedInputsImpl(const Node& node) const override;
 
@@ -1079,8 +1086,8 @@ bool UnaryOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initia
     return true;
 }
 
-int32_t UnaryOpSupportChecker::GetMinSupportedNNAPIFeatureLevel(
-    const Node& node, const OpSupportCheckParams& /* params */) const {
+int32_t UnaryOpSupportChecker::GetMinSupportedNNAPIFeatureLevel(const Node& node,
+                                                                const OpSupportCheckParams& /* params */) const {
   const auto& op(node.OpType());
   if (op == "Abs" ||
       op == "Exp" ||
@@ -1216,7 +1223,8 @@ class SqueezeOpSupportChecker : public BaseOpSupportChecker {
   bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node,
                          const OpSupportCheckParams& params) const override;
 
-  int32_t GetMinSupportedNNAPIFeatureLevel(const Node& /* node */, const OpSupportCheckParams& /* params */) const override {
+  int32_t GetMinSupportedNNAPIFeatureLevel(const Node& /* node */,
+                                           const OpSupportCheckParams& /* params */) const override {
     return ANEURALNETWORKS_FEATURE_LEVEL_2;
   }
 };
@@ -1255,7 +1263,8 @@ class QuantizeLinearOpSupportChecker : public BaseOpSupportChecker {
   bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node,
                          const OpSupportCheckParams& params) const override;
 
-  int32_t GetMinSupportedNNAPIFeatureLevel(const Node& /* node */, const OpSupportCheckParams& /* params */) const override {
+  int32_t GetMinSupportedNNAPIFeatureLevel(const Node& /* node */,
+                                           const OpSupportCheckParams& /* params */) const override {
     return ANEURALNETWORKS_FEATURE_LEVEL_3;
   }
 };
@@ -1296,7 +1305,8 @@ class DequantizeLinearOpSupportChecker : public BaseOpSupportChecker {
   bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node,
                          const OpSupportCheckParams& params) const override;
 
-  int32_t GetMinSupportedNNAPIFeatureLevel(const Node& /* node */, const OpSupportCheckParams& /* params */) const override {
+  int32_t GetMinSupportedNNAPIFeatureLevel(const Node& /* node */,
+                                           const OpSupportCheckParams& /* params */) const override {
     return ANEURALNETWORKS_FEATURE_LEVEL_1;
   }
   bool HasSupportedInputsImpl(const Node& node) const override;
@@ -1340,7 +1350,8 @@ class LRNOpSupportChecker : public BaseOpSupportChecker {
   bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node,
                          const OpSupportCheckParams& params) const override;
 
-  int32_t GetMinSupportedNNAPIFeatureLevel(const Node& /* node */, const OpSupportCheckParams& /* params */) const override {
+  int32_t GetMinSupportedNNAPIFeatureLevel(const Node& /* node */,
+                                           const OpSupportCheckParams& /* params */) const override {
     return ANEURALNETWORKS_FEATURE_LEVEL_2;
   }
 };
@@ -1397,7 +1408,8 @@ class ResizeOpSupportChecker : public BaseOpSupportChecker {
   bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node,
                          const OpSupportCheckParams& params) const override;
 
-  int32_t GetMinSupportedNNAPIFeatureLevel(const Node& /* node */, const OpSupportCheckParams& /* params */) const override;
+  int32_t GetMinSupportedNNAPIFeatureLevel(const Node& /* node */,
+                                           const OpSupportCheckParams& /* params */) const override;
 
   // Resize opset 10- is very different than Resize opset 11+, with many key attributes missing
   // We only support Resize opset 11+ here
@@ -1516,7 +1528,8 @@ bool ResizeOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initi
   return true;
 }
 
-int32_t ResizeOpSupportChecker::GetMinSupportedNNAPIFeatureLevel(const Node& node, const OpSupportCheckParams& /* params */) const {
+int32_t ResizeOpSupportChecker::GetMinSupportedNNAPIFeatureLevel(const Node& node,
+                                                                 const OpSupportCheckParams& /* params */) const {
   int32_t input_type;
 
   // This should not happen, but if it happens make sure this will require an impossible version
@@ -1590,7 +1603,8 @@ class MinMaxOpSupportChecker : public BaseOpSupportChecker {
       const std::string& op_type, OpSupportCheckerRegistrations& op_registrations);
 
  private:
-  int32_t GetMinSupportedNNAPIFeatureLevel(const Node& /* node */, const OpSupportCheckParams& /* params */) const override {
+  int32_t GetMinSupportedNNAPIFeatureLevel(const Node& /* node */,
+                                           const OpSupportCheckParams& /* params */) const override {
     return ANEURALNETWORKS_FEATURE_LEVEL_3;
   }
 
@@ -1629,7 +1643,8 @@ bool MinMaxOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& /* in
 
 class EluOpSupportChecker : public BaseOpSupportChecker {
  private:
-  int32_t GetMinSupportedNNAPIFeatureLevel(const Node& /* node */, const OpSupportCheckParams& /* params */) const override {
+  int32_t GetMinSupportedNNAPIFeatureLevel(const Node& /* node */,
+                                           const OpSupportCheckParams& /* params */) const override {
     return ANEURALNETWORKS_FEATURE_LEVEL_4;
   }
 
@@ -1639,6 +1654,63 @@ class EluOpSupportChecker : public BaseOpSupportChecker {
 
 #pragma endregion
 
+#pragma region op_slice
+
+class SliceOpSupportChecker : public BaseOpSupportChecker {  // support checker registered for the "Slice" op
+ private:
+  int32_t GetMinSupportedNNAPIFeatureLevel(const Node& /* node */,
+                                           const OpSupportCheckParams& /* params */) const override {
+    return ANEURALNETWORKS_FEATURE_LEVEL_2;  // minimum NNAPI feature level reported for Slice
+  }
+
+  // Only Slice opset 10 and later is supported; the node argument is not consulted
+  int GetMinSupportedOpSet(const Node& /* node */) const override { return 10; }
+
+  bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node,
+                         const OpSupportCheckParams& params) const override;  // defined out of line below
+};
+
+bool SliceOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node,
+                                              const OpSupportCheckParams& /* params */) const {
+  Shape input_shape;
+  if (!GetShape(*node.InputDefs()[0], input_shape))  // shape of input 0 could not be obtained
+    return false;
+
+  if (input_shape.size() > 4) {  // reject inputs with more than 4 dimensions
+    LOGS_DEFAULT(VERBOSE) << "Slice only supports 1-4d shape, input is "
+                          << input_shape.size() << "d shape";
+    return false;
+  }
+
+  // A dimension of 0 is treated as dynamic (see the log below). TODO: std::find would do; it does not need C++17
+  if (std::any_of(input_shape.cbegin(), input_shape.cend(), [](int32_t i) { return i == 0; })) {
+    LOGS_DEFAULT(VERBOSE) << "Slice doesn't support dynamic input shape";
+    return false;
+  }
+
+  if (!CheckIsInitializer(initializers, node, 1, "starts")) {  // input 1 must be an initializer
+    return false;
+  }
+  if (!CheckIsInitializer(initializers, node, 2, "ends")) {  // input 2 must be an initializer
+    return false;
+  }
+  const auto& input_defs = node.InputDefs();
+  if (input_defs.size() > 3) {  // node has a 4th input ("axes"); it must be an initializer too
+    if (!CheckIsInitializer(initializers, node, 3, "axes")) {
+      return false;
+    }
+    if (input_defs.size() > 4) {  // node has a 5th input ("steps"); same requirement
+      if (!CheckIsInitializer(initializers, node, 4, "steps")) {
+        return false;
+      }
+    }
+  }
+
+  return true;  // NOTE(review): an empty (0-sized) slice output is not rejected here - confirm it is handled elsewhere
+}
+
+#pragma endregion
+
 #pragma region CreateGetOpSupportCheckers
 
 // The reason we use macros to create OpBuilders is for easy exclusion in build if certain op(s) are not used
@@ -1657,26 +1729,43 @@ class EluOpSupportChecker : public BaseOpSupportChecker {
 static OpSupportCheckerRegistrations CreateOpSupportCheckerRegistrations() {
   OpSupportCheckerRegistrations op_registrations;
 
-  {
-    NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Add", BinaryOpSupportChecker);
-    NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Sub", BinaryOpSupportChecker);
-    NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Mul", BinaryOpSupportChecker);
-    NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Div", BinaryOpSupportChecker);
-    NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("QLinearAdd", BinaryOpSupportChecker);
-    NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Pow", BinaryOpSupportChecker);
-  }
+  // Support checkers handle a single op
+  NNAPI_EP_ADD_SINGLE_OP_SUPPORT_CHECKER("BatchNormalization", BatchNormalizationOpSupportChecker);
+  NNAPI_EP_ADD_SINGLE_OP_SUPPORT_CHECKER("Cast", CastOpSupportChecker);
+  NNAPI_EP_ADD_SINGLE_OP_SUPPORT_CHECKER("Clip", ClipOpSupportChecker);
+  NNAPI_EP_ADD_SINGLE_OP_SUPPORT_CHECKER("Concat", ConcatOpSupportChecker);
+  NNAPI_EP_ADD_SINGLE_OP_SUPPORT_CHECKER("DequantizeLinear", DequantizeLinearOpSupportChecker);
+  NNAPI_EP_ADD_SINGLE_OP_SUPPORT_CHECKER("Elu", EluOpSupportChecker);
+  NNAPI_EP_ADD_SINGLE_OP_SUPPORT_CHECKER("Flatten", FlattenOpSupportChecker);
+  NNAPI_EP_ADD_SINGLE_OP_SUPPORT_CHECKER("LRN", LRNOpSupportChecker);
+  NNAPI_EP_ADD_SINGLE_OP_SUPPORT_CHECKER("QuantizeLinear", QuantizeLinearOpSupportChecker);
+  NNAPI_EP_ADD_SINGLE_OP_SUPPORT_CHECKER("Reshape", ReshapeOpSupportChecker);
+  NNAPI_EP_ADD_SINGLE_OP_SUPPORT_CHECKER("Resize", ResizeOpSupportChecker);
+  NNAPI_EP_ADD_SINGLE_OP_SUPPORT_CHECKER("Slice", SliceOpSupportChecker);
+  NNAPI_EP_ADD_SINGLE_OP_SUPPORT_CHECKER("Softmax", SoftMaxOpSupportChecker);
+  NNAPI_EP_ADD_SINGLE_OP_SUPPORT_CHECKER("Squeeze", SqueezeOpSupportChecker);
+  NNAPI_EP_ADD_SINGLE_OP_SUPPORT_CHECKER("Transpose", TransposeOpSupportChecker);
+
+  // Identity is always supported, we use BaseOpSupportChecker as default
+  NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Identity", BaseOpSupportChecker);
 
   // Relu is always supported, we use BaseOpSupportChecker as default
   NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Relu", BaseOpSupportChecker);
 
-  NNAPI_EP_ADD_SINGLE_OP_SUPPORT_CHECKER("Transpose", TransposeOpSupportChecker);
-  NNAPI_EP_ADD_SINGLE_OP_SUPPORT_CHECKER("Reshape", ReshapeOpSupportChecker);
-  NNAPI_EP_ADD_SINGLE_OP_SUPPORT_CHECKER("BatchNormalization", BatchNormalizationOpSupportChecker);
+  // Support Checkers shared among similar ops
+  {
+    NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Add", BinaryOpSupportChecker);
+    NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Div", BinaryOpSupportChecker);
+    NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Mul", BinaryOpSupportChecker);
+    NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Pow", BinaryOpSupportChecker);
+    NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("QLinearAdd", BinaryOpSupportChecker);
+    NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Sub", BinaryOpSupportChecker);
+  }
 
   {
+    NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("AveragePool", PoolOpSupportChecker);
     NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("GlobalAveragePool", PoolOpSupportChecker);
     NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("GlobalMaxPool", PoolOpSupportChecker);
-    NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("AveragePool", PoolOpSupportChecker);
     NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("MaxPool", PoolOpSupportChecker);
     NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("QLinearAveragePool", PoolOpSupportChecker);
   }
@@ -1686,12 +1775,6 @@ static OpSupportCheckerRegistrations CreateOpSupportCheckerRegistrations() {
     NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("QLinearConv", ConvOpSupportChecker);
   }
 
-  NNAPI_EP_ADD_SINGLE_OP_SUPPORT_CHECKER("Cast", CastOpSupportChecker);
-  NNAPI_EP_ADD_SINGLE_OP_SUPPORT_CHECKER("Softmax", SoftMaxOpSupportChecker);
-
-  // Identity is always supported, we use BaseOpSupportChecker as default
-  NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Identity", BaseOpSupportChecker);
-
   {
     NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Gemm", GemmOpSupportChecker);
     NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("MatMul", GemmOpSupportChecker);
@@ -1703,30 +1786,19 @@ static OpSupportCheckerRegistrations CreateOpSupportCheckerRegistrations() {
     NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Exp", UnaryOpSupportChecker);
     NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Floor", UnaryOpSupportChecker);
     NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Log", UnaryOpSupportChecker);
-    NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Sigmoid", UnaryOpSupportChecker);
     NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Neg", UnaryOpSupportChecker);
+    NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("QLinearSigmoid", UnaryOpSupportChecker);
+    NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Sigmoid", UnaryOpSupportChecker);
     NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Sin", UnaryOpSupportChecker);
     NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Sqrt", UnaryOpSupportChecker);
     NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Tanh", UnaryOpSupportChecker);
-    NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("QLinearSigmoid", UnaryOpSupportChecker);
   }
 
-  NNAPI_EP_ADD_SINGLE_OP_SUPPORT_CHECKER("Concat", ConcatOpSupportChecker);
-  NNAPI_EP_ADD_SINGLE_OP_SUPPORT_CHECKER("Squeeze", SqueezeOpSupportChecker);
-  NNAPI_EP_ADD_SINGLE_OP_SUPPORT_CHECKER("QuantizeLinear", QuantizeLinearOpSupportChecker);
-  NNAPI_EP_ADD_SINGLE_OP_SUPPORT_CHECKER("DequantizeLinear", DequantizeLinearOpSupportChecker);
-  NNAPI_EP_ADD_SINGLE_OP_SUPPORT_CHECKER("LRN", LRNOpSupportChecker);
-  NNAPI_EP_ADD_SINGLE_OP_SUPPORT_CHECKER("Clip", ClipOpSupportChecker);
-  NNAPI_EP_ADD_SINGLE_OP_SUPPORT_CHECKER("Resize", ResizeOpSupportChecker);
-  NNAPI_EP_ADD_SINGLE_OP_SUPPORT_CHECKER("Flatten", FlattenOpSupportChecker);
-
   {
-    NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Min", MinMaxOpSupportChecker);
     NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Max", MinMaxOpSupportChecker);
+    NNAPI_EP_ADD_SHARED_OP_SUPPORT_CHECKER("Min", MinMaxOpSupportChecker);
   }
 
-  NNAPI_EP_ADD_SINGLE_OP_SUPPORT_CHECKER("Elu", EluOpSupportChecker);
-
   return op_registrations;
 }
 
diff --git a/onnxruntime/test/providers/cpu/tensor/slice_op.test.cc b/onnxruntime/test/providers/cpu/tensor/slice_op.test.cc
index a194095a15..4389508fd3 100644
--- a/onnxruntime/test/providers/cpu/tensor/slice_op.test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/slice_op.test.cc
@@ -19,8 +19,18 @@ void RunSliceTest(const std::vector<int64_t>& input_dims,
                   const std::vector<int64_t>& output_dims,
                   const std::vector<T>& output_vals,
                   bool v10_only = false) {
-  // V1-9
-  ORT_UNUSED_PARAMETER(steps);
+  std::unordered_set<std::string> excluded_providers;
+
+  if (!v10_only)
+    excluded_providers = {kTensorrtExecutionProvider, kOpenVINOExecutionProvider};
+  else
+    excluded_providers = {kTensorrtExecutionProvider};
+
+  // NNAPI EP does not support empty output
+  if (std::any_of(output_dims.cbegin(), output_dims.cend(), [](int64_t i) { return i == 0; })) {
+    excluded_providers.insert(kNnapiExecutionProvider);
+  }
+
   if (!v10_only) {
     OpTester testv9("Slice", 9);
     testv9.AddAttribute("starts", starts);
@@ -29,20 +39,27 @@ void RunSliceTest(const std::vector<int64_t>& input_dims,
       testv9.AddAttribute("axes", axes);
     testv9.AddInput<T>("data", input_dims, input_vals);
     testv9.AddOutput<T>("output", output_dims, output_vals);
-    testv9.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); // OpenVINO EP: Disabled temporarily
+    testv9.Run(OpTester::ExpectResult::kExpectSuccess, "", excluded_providers);  // OpenVINO EP: Disabled temporarily
   }
 
   // V10
-  OpTester testv10("Slice", 10);
-  testv10.AddInput<T>("data", input_dims, input_vals);
-  testv10.AddInput<int64_t>("starts", {static_cast<int64_t>(starts.size())}, starts);
-  testv10.AddInput<int64_t>("ends", {static_cast<int64_t>(ends.size())}, ends);
-  if (axes.size() != 0)
-    testv10.AddInput<int64_t>("axes", {static_cast<int64_t>(axes.size())}, axes);
-  if (steps.size() != 0)
-    testv10.AddInput<int64_t>("steps", {static_cast<int64_t>(steps.size())}, steps);
-  testv10.AddOutput<T>("output", output_dims, output_vals);
-  testv10.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
+  auto run_test = [&](bool only_data_not_initializer) {
+    OpTester testv10("Slice", 10);
+    testv10.AddInput<T>("data", input_dims, input_vals);
+    testv10.AddInput<int64_t>("starts", {static_cast<int64_t>(starts.size())}, starts, only_data_not_initializer);
+    testv10.AddInput<int64_t>("ends", {static_cast<int64_t>(ends.size())}, ends, only_data_not_initializer);
+    if (axes.size() != 0)
+      testv10.AddInput<int64_t>("axes", {static_cast<int64_t>(axes.size())}, axes, only_data_not_initializer);
+    if (steps.size() != 0)
+      testv10.AddInput<int64_t>("steps", {static_cast<int64_t>(steps.size())}, steps, only_data_not_initializer);
+    testv10.AddOutput<T>("output", output_dims, output_vals);
+    testv10.Run(OpTester::ExpectResult::kExpectSuccess, "", excluded_providers);
+  };
+
+  run_test(false);
+
+  // NNAPI EP requires the starts/ends/axes/steps be initializers
+  run_test(true);
 }
 
 // Slice V1-9 & Slice V10 can both run the following tests