Initial PR for VSINPU execution provider (#20903)

### Description  - This is the initial PR for the VSINPU execution provider. ### Motivation and Context  - Adds support for VeriSilicon hardware. - TIM-VX (Tensor Interface Module) (https://github.com/VeriSilicon/TIM-VX) is an integrated software solution by VeriSilicon for its hardware designs (A311D, i.MX 8M Plus, etc.). The VSINPU execution provider connects onnxruntime to the TIM-VX API, which makes it easy to use VeriSilicon's hardware.
2026-07-02 03:55:34 +00:00 · 2024-06-29 12:48:34 +08:00 · 2024-06-29 12:48:34 +08:00 · 56b36a58ba
commit 56b36a58ba
parent 9007ede102
65 changed files with 5096 additions and 3 deletions
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@ -101,6 +101,7 @@ option(onnxruntime_BUILD_OBJC "Build Objective-C library" OFF)
 option(onnxruntime_USE_PREINSTALLED_EIGEN "Use pre-installed EIGEN. Need to provide eigen_SOURCE_PATH if turn this on." OFF)
 option(onnxruntime_BUILD_BENCHMARKS "Build ONNXRuntime micro-benchmarks" OFF)
 option(onnxruntime_USE_LLVM "Build TVM with LLVM" OFF)
+option(onnxruntime_USE_VSINPU "Build with VSINPU support" OFF)

 cmake_dependent_option(onnxruntime_USE_FLASH_ATTENTION "Build flash attention kernel for scaled dot product attention" ON "onnxruntime_USE_CUDA" OFF)
 option(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION "Build memory efficient attention kernel for scaled dot product attention" ON)
@ -797,6 +798,11 @@ if (onnxruntime_USE_RKNPU)
    list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_RKNPU=1)
    list(APPEND ONNXRUNTIME_PROVIDER_NAMES rknpu)
 endif()
+# VSINPU execution provider: when enabled, append its compile definition, its
+# CMake option and its provider name to the shared provider lists.
+if (onnxruntime_USE_VSINPU)
+    list(APPEND ORT_PROVIDER_FLAGS -DUSE_VSINPU=1)
+    list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_VSINPU=1)
+    list(APPEND ONNXRUNTIME_PROVIDER_NAMES vsinpu)
+endif()
 if (onnxruntime_USE_NNAPI_BUILTIN)
    list(APPEND ORT_PROVIDER_FLAGS -DUSE_NNAPI=1)
    list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_NNAPI_BUILTIN=1)
--- a/cmake/onnxruntime.cmake
+++ b/cmake/onnxruntime.cmake
@ -189,6 +189,7 @@ set(onnxruntime_INTERNAL_LIBRARIES
  ${PROVIDERS_SNPE}
  ${PROVIDERS_TVM}
  ${PROVIDERS_RKNPU}
+  ${PROVIDERS_VSINPU}
  ${PROVIDERS_XNNPACK}
  ${PROVIDERS_WEBNN}
  ${PROVIDERS_AZURE}
--- a/cmake/onnxruntime_providers.cmake
+++ b/cmake/onnxruntime_providers.cmake
@ -80,6 +80,9 @@ endif()
 if(onnxruntime_USE_RKNPU)
  set(PROVIDERS_RKNPU onnxruntime_providers_rknpu)
 endif()
+# Name of the VSINPU provider library target; expanded as ${PROVIDERS_VSINPU}
+# in library lists and left undefined (empty) when the provider is disabled.
+if(onnxruntime_USE_VSINPU)
+  set(PROVIDERS_VSINPU onnxruntime_providers_vsinpu)
+endif()
 if(onnxruntime_USE_DML)
  set(PROVIDERS_DML onnxruntime_providers_dml)
 endif()
@ -188,6 +191,35 @@ if (onnxruntime_USE_TVM)
  include(onnxruntime_providers_tvm.cmake)
 endif()

+if (onnxruntime_USE_VSINPU)
+  # Builds the VSINPU execution provider library on top of a pre-built TIM-VX.
+  # The TIM-VX install prefix is read from the TIM_VX_INSTALL environment variable
+  # at configure time; fail early with a clear message when it is missing, instead
+  # of silently searching "/include" and "/lib".
+  if ("$ENV{TIM_VX_INSTALL}" STREQUAL "")
+    message(FATAL_ERROR "TIM_VX_INSTALL environment variable is not set; it must point to the TIM-VX install directory.")
+  endif()
+
+  add_definitions(-DUSE_VSINPU=1)
+  # NOTE: CMAKE_CXX_FLAGS is a directory-scoped variable, so this relaxes
+  # -Wunused-parameter beyond the VSINPU provider target itself.
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter")
+  file(GLOB_RECURSE onnxruntime_providers_vsinpu_srcs
+    "${ONNXRUNTIME_ROOT}/core/providers/vsinpu/builders/*.h"
+    "${ONNXRUNTIME_ROOT}/core/providers/vsinpu/builders/*.cc"
+    "${ONNXRUNTIME_ROOT}/core/providers/vsinpu/*.h"
+    "${ONNXRUNTIME_ROOT}/core/providers/vsinpu/*.cc"
+    "${ONNXRUNTIME_ROOT}/core/providers/shared/utils/utils.h"
+    "${ONNXRUNTIME_ROOT}/core/providers/shared/utils/utils.cc"
+  )
+  source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_vsinpu_srcs})
+  add_library(onnxruntime_providers_vsinpu ${onnxruntime_providers_vsinpu_srcs})
+  onnxruntime_add_include_to_target(onnxruntime_providers_vsinpu
+    onnxruntime_common onnxruntime_framework onnx onnx_proto protobuf::libprotobuf-lite flatbuffers Boost::mp11
+    safeint_interface nsync::nsync_cpp)
+  add_dependencies(onnxruntime_providers_vsinpu ${onnxruntime_EXTERNAL_DEPENDENCIES})
+  set_target_properties(onnxruntime_providers_vsinpu PROPERTIES FOLDER "ONNXRuntime" LINKER_LANGUAGE CXX)
+  # Quote the env-derived paths so an install prefix containing spaces stays one argument.
+  target_include_directories(onnxruntime_providers_vsinpu PRIVATE ${ONNXRUNTIME_ROOT} "$ENV{TIM_VX_INSTALL}/include")
+
+  # Only the given install prefix is searched (NO_DEFAULT_PATH); system copies are ignored.
+  find_library(TIMVX_LIBRARY NAMES tim-vx PATHS "$ENV{TIM_VX_INSTALL}/lib" NO_DEFAULT_PATH)
+  if(TIMVX_LIBRARY)
+    target_link_libraries(onnxruntime_providers_vsinpu PRIVATE ${TIMVX_LIBRARY})
+  else()
+    message(FATAL_ERROR "Cannot find TIM-VX library!")
+  endif()
+
+endif()
+
 if (onnxruntime_USE_XNNPACK)
  include(onnxruntime_providers_xnnpack.cmake)
 endif()
--- a/cmake/onnxruntime_unittests.cmake
+++ b/cmake/onnxruntime_unittests.cmake
@ -546,6 +546,10 @@ if(onnxruntime_USE_NNAPI_BUILTIN)
  list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_nnapi)
 endif()

+# Add the VSINPU provider library to the test provider dependencies when it is enabled.
+if(onnxruntime_USE_VSINPU)
+  list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_vsinpu)
+endif()
+
 if(onnxruntime_USE_JSEP)
  list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_js)
 endif()
@ -589,6 +593,7 @@ set(ONNXRUNTIME_TEST_LIBS
    ${onnxruntime_libs}
    # CUDA, ROCM, TENSORRT, MIGRAPHX, DNNL, and OpenVINO are dynamically loaded at runtime
    ${PROVIDERS_NNAPI}
+    ${PROVIDERS_VSINPU}
    ${PROVIDERS_JS}
    ${PROVIDERS_QNN}
    ${PROVIDERS_SNPE}
--- a/include/onnxruntime/core/graph/constants.h
+++ b/include/onnxruntime/core/graph/constants.h
@ -52,6 +52,7 @@ constexpr const char* kXnnpackExecutionProvider = "XnnpackExecutionProvider";
 constexpr const char* kWebNNExecutionProvider = "WebNNExecutionProvider";
 constexpr const char* kCannExecutionProvider = "CANNExecutionProvider";
 constexpr const char* kAzureExecutionProvider = "AzureExecutionProvider";
+constexpr const char* kVSINPUExecutionProvider = "VSINPUExecutionProvider";

 constexpr const char* kExecutionProviderSharedLibraryPath = "shared_lib_path";
 constexpr const char* kExecutionProviderSharedLibraryEntry = "provider_factory_entry_point";
--- a/include/onnxruntime/core/providers/vsinpu/vsinpu_provider_factory.h
+++ b/include/onnxruntime/core/providers/vsinpu/vsinpu_provider_factory.h
@ -0,0 +1,34 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2023 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#include "onnxruntime_c_api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ORT_API_STATUS(OrtSessionOptionsAppendExecutionProvider_VSINPU, _In_ OrtSessionOptions* options);
+
+#ifdef __cplusplus
+}
+#endif
--- a/onnxruntime/core/framework/node_unit.cc
+++ b/onnxruntime/core/framework/node_unit.cc
@ -285,7 +285,7 @@ void NodeUnit::InitForSingleNode() {
  const auto& output_defs = target_node_.OutputDefs();
  const auto& node_attrs = target_node_.GetAttributes();
  auto qlinear_type = GetQLinearOpType(target_node_);
-  if (qlinear_type == QLinearOpType::Unknown || IsVariadicQLinearOp(qlinear_type)) {  // TODO, add variadic support
+  if (qlinear_type == QLinearOpType::Unknown) {
    // Not a Qlinear op, add all inputs / outputs
    auto add_all_io = [](std::vector<NodeUnitIODef>& defs,
                         const ConstPointerContainer<std::vector<NodeArg*>>& node_defs) {
@ -351,6 +351,13 @@ void NodeUnit::InitForSingleNode() {
                                     NodeUnitIODef::QuantParam{*input_defs[1],
                                                               input_defs.size() == 3 ? input_defs[2] : nullptr,
                                                               axis}});
+  } else if (IsVariadicQLinearOp(qlinear_type)) {
+    size_t input_num = (input_defs.size() - 2) / 3;
+    for (size_t i = 0; i < input_num; i++) {
+      inputs_.push_back(NodeUnitIODef{*input_defs[3 * i + 2], NodeUnitIODef::QuantParam{*input_defs[3 * i + 3],
+                                                                                        input_defs[3 * i + 4]}});
+    }
+    outputs_.push_back(NodeUnitIODef{*output_defs[0], NodeUnitIODef::QuantParam{*input_defs[0], input_defs[1]}});
  } else {
    ORT_THROW("The QLinear op [", static_cast<uint8_t>(qlinear_type), "] is not supported");
  }
--- a/onnxruntime/core/framework/utils.cc
+++ b/onnxruntime/core/framework/utils.cc
@ -61,6 +61,7 @@ bool ProviderIsCpuBased(const std::string& provider_type) {
         provider_type == onnxruntime::kVitisAIExecutionProvider ||
         provider_type == onnxruntime::kOpenVINOExecutionProvider ||
         provider_type == onnxruntime::kNnapiExecutionProvider ||
+         provider_type == onnxruntime::kVSINPUExecutionProvider ||
         provider_type == onnxruntime::kAclExecutionProvider ||
         provider_type == onnxruntime::kArmNNExecutionProvider ||
         provider_type == onnxruntime::kRknpuExecutionProvider ||
--- a/onnxruntime/core/providers/get_execution_providers.cc
+++ b/onnxruntime/core/providers/get_execution_providers.cc
@ -98,6 +98,14 @@ constexpr ProviderInfo kProvidersInPriorityOrder[] =
            true,
 #else
            false,
+#endif
+        },
+        {
+            kVSINPUExecutionProvider,
+#ifdef USE_VSINPU
+            true,
+#else
+            false,
 #endif
        },
        {
--- a/onnxruntime/core/providers/provider_factory_creators.h
+++ b/onnxruntime/core/providers/provider_factory_creators.h
@ -46,6 +46,10 @@
 #include "core/providers/nnapi/nnapi_provider_factory_creator.h"
 #endif

+#if defined(USE_VSINPU)
+#include "core/providers/vsinpu/vsinpu_provider_factory_creator.h"
+#endif
+
 #if defined(USE_JSEP)
 #include "core/providers/js/js_provider_factory_creator.h"
 #endif
--- a/onnxruntime/core/providers/vsinpu/builders/impl/activation_op_builder.h
+++ b/onnxruntime/core/providers/vsinpu/builders/impl/activation_op_builder.h
@ -0,0 +1,130 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2023 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#include <memory>
+#include <vector>
+#include <utility>
+#include "core/providers/shared/utils/utils.h"
+#include "core/providers/vsinpu/builders/impl/base_op_builder.h"
+
+namespace onnxruntime {
+namespace vsi {
+namespace npu {
+// Emits a TIM-VX Relu operation wired to the node unit's tensors.
+class ReluOpBuilder : public BaseOpBuilder {
+ public:
+  // Creates the Relu op on the EP graph, binds all inputs/outputs and records it
+  // in the EP's op list. Always reports success.
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                     const NodeUnit& node_unit) override {
+    LOGS_DEFAULT(VERBOSE) << "Creating Relu Activation.";
+    auto relu = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Relu>();
+    relu->BindInputs(inputs);
+    relu->BindOutputs(outputs);
+    graph_ep->GetOps().push_back(std::move(relu));
+    return true;
+  }
+};
+// Emits a TIM-VX Sigmoid operation wired to the node unit's tensors.
+class SigmoidOpBuilder : public BaseOpBuilder {
+ public:
+  // Creates the Sigmoid op on the EP graph, binds all inputs/outputs and records
+  // it in the EP's op list. Always reports success.
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                     const NodeUnit& node_unit) override {
+    LOGS_DEFAULT(VERBOSE) << "Creating Sigmoid Activation.";
+    auto sigmoid = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Sigmoid>();
+    sigmoid->BindInputs(inputs);
+    sigmoid->BindOutputs(outputs);
+    graph_ep->GetOps().push_back(std::move(sigmoid));
+    return true;
+  }
+};
+// Emits a TIM-VX Tanh operation wired to the node unit's tensors.
+class TanhOpBuilder : public BaseOpBuilder {
+ public:
+  // Creates the Tanh op on the EP graph, binds all inputs/outputs and records it
+  // in the EP's op list. Always reports success.
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                     const NodeUnit& node_unit) override {
+    LOGS_DEFAULT(VERBOSE) << "Creating Tanh activation.";
+    auto tanh_op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Tanh>();
+    tanh_op->BindInputs(inputs);
+    tanh_op->BindOutputs(outputs);
+    graph_ep->GetOps().push_back(std::move(tanh_op));
+    return true;
+  }
+};
+
+// Emits a TIM-VX LeakyRelu operation configured from the node's "alpha" attribute.
+class LeakyReluOpBuilder : public BaseOpBuilder {
+ public:
+  // Creates the LeakyRelu op on the EP graph, binds all inputs/outputs and records
+  // it in the EP's op list. Always reports success.
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                     const NodeUnit& node_unit) override {
+    LOGS_DEFAULT(VERBOSE) << "Creating LeakyRelu activation.";
+    const auto& node = node_unit.GetNode();
+    NodeAttrHelper helper(node);
+    // ONNX LeakyRelu defaults alpha to 0.01 when the attribute is omitted; a
+    // fallback of 1.0 would turn the op into an identity.
+    auto alpha = helper.Get("alpha", 0.01f);
+    auto op =
+        graph_ep->GetGraph()->CreateOperation<tim::vx::ops::LeakyRelu>(alpha);
+    (*op).BindInputs(inputs).BindOutputs(outputs);
+    graph_ep->GetOps().push_back(std::move(op));
+    return true;
+  }
+};
+
+// Emits a TIM-VX Elu operation configured from the node's "alpha" attribute.
+class EluOpBuilder : public BaseOpBuilder {
+ public:
+  // Creates the Elu op on the EP graph, binds all inputs/outputs and records it
+  // in the EP's op list. Always reports success.
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                     const NodeUnit& node_unit) override {
+    LOGS_DEFAULT(VERBOSE) << "Creating Elu activation.";
+    const auto& node = node_unit.GetNode();
+    NodeAttrHelper helper(node);
+    // ONNX Elu defaults alpha to 1.0 when the attribute is omitted.
+    auto alpha = helper.Get("alpha", 1.0f);
+    // Must be the Elu op: LeakyRelu computes alpha * x for negative inputs,
+    // not alpha * (exp(x) - 1).
+    auto op =
+        graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Elu>(alpha);
+    (*op).BindInputs(inputs).BindOutputs(outputs);
+    graph_ep->GetOps().push_back(std::move(op));
+    return true;
+  }
+};
+
+// Emits a TIM-VX HardSigmoid operation configured from the node's "alpha" and
+// "beta" attributes.
+class HardSigmoidOpBuilder : public BaseOpBuilder {
+ public:
+  // Creates the HardSigmoid op on the EP graph, binds all inputs/outputs and
+  // records it in the EP's op list. Always reports success.
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                     const NodeUnit& node_unit) override {
+    LOGS_DEFAULT(VERBOSE) << "Creating HardSigmoid activation.";
+    const auto& node = node_unit.GetNode();
+    NodeAttrHelper helper(node);
+    // ONNX HardSigmoid defaults: alpha = 0.2, beta = 0.5 when the attributes are omitted.
+    auto alpha = helper.Get("alpha", 0.2f);
+    auto beta = helper.Get("beta", 0.5f);
+    auto op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::HardSigmoid>(
+        alpha, beta);
+    (*op).BindInputs(inputs).BindOutputs(outputs);
+    graph_ep->GetOps().push_back(std::move(op));
+    return true;
+  }
+};
+}  // namespace npu
+
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/builders/impl/base_op_builder.cc
+++ b/onnxruntime/core/providers/vsinpu/builders/impl/base_op_builder.cc
@ -0,0 +1,205 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2023 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#include <string>
+#include "core/providers/vsinpu/builders/impl/base_op_builder.h"
+
+namespace onnxruntime {
+namespace vsi {
+namespace npu {
+// Returns true when the node unit passes, in order, the opset range check, the
+// input/output validation and the builder-specific IsOpSupported() hook.
+bool BaseOpBuilder::IsSupported(const onnxruntime::GraphViewer& graph_viewer,
+                                const NodeUnit& node_unit) const {
+  if (!HasSupportedOpSet(node_unit)) {
+    return false;
+  }
+  // Bind by reference: copying the initializer map for every queried node is wasted work.
+  const auto& initializers = graph_viewer.GetAllInitializedTensors();
+  if (!HasSupportedInputOutputs(initializers, node_unit)) {
+    return false;
+  }
+  return IsOpSupported(graph_viewer, &node_unit.GetNode());
+}
+
+// Validates the shapes and quantization parameters of a node unit's inputs and
+// outputs, then defers to HasSupportedInputOutputsImpl() for the type checks.
+// Returns false (after logging the reason) on the first unsupported item.
+bool BaseOpBuilder::HasSupportedInputOutputs(const InitializedTensorSet& initializers,
+                                             const NodeUnit& node_unit) const {
+  // We do not support unknown(null) input shape
+  auto has_supported_shape = [](const NodeArg& node_arg, const std::string& name, const std::string& op_type) {
+    const auto* shape_proto = node_arg.Shape();
+    if (!shape_proto) {
+      LOGS_DEFAULT(WARNING) << "Node [" << name << "] type [" << op_type
+                            << "] Input [" << node_arg.Name() << "] has no shape";
+      return false;
+    }
+
+    // Dynamic dims (no concrete dim value) are rejected. Zero-sized dims are
+    // rejected as well, except for Resize nodes, where an input may be empty.
+    for (const auto& dim : shape_proto->dim()) {
+      if (!dim.has_dim_value()) {
+        LOGS_DEFAULT(WARNING) << "Dynamic shape is not supported for now, for input:" << node_arg.Name();
+        return false;
+      }
+      if (dim.dim_value() == 0 && op_type != "Resize") {
+        LOGS_DEFAULT(WARNING) << "Zero in shape is not supported for now, for input:" << node_arg.Name();
+        return false;
+      }
+    }
+    return true;
+  };
+
+  // A quantization parameter (scale / zero point) must be present in the initializer set.
+  auto has_initialized_quant_param = [](const NodeArg& arg, const InitializedTensorSet& initializers) {
+    auto it = initializers.find(arg.Name());
+    if (it == initializers.end()) {
+      LOGS_DEFAULT(WARNING) << "The quantization param must be an initializer tensor";
+      return false;
+    }
+    return true;
+  };
+
+  for (const auto& input : node_unit.Inputs()) {
+    // Inputs whose NodeArg does not exist are skipped entirely.
+    if (!input.node_arg.Exists()) {
+      continue;
+    }
+    if (!has_supported_shape(input.node_arg, node_unit.Name(), node_unit.OpType()))
+      return false;
+
+    if (input.quant_param.has_value()) {
+      if (!has_supported_shape(input.quant_param->scale, node_unit.Name(), node_unit.OpType()))
+        return false;
+
+      if (!has_initialized_quant_param(input.quant_param->scale, initializers))
+        return false;
+      // zero point is optional
+      if (input.quant_param->zero_point) {
+        if (!has_supported_shape(*input.quant_param->zero_point, node_unit.Name(), node_unit.OpType()))
+          return false;
+        if (!has_initialized_quant_param(*input.quant_param->zero_point, initializers))
+          return false;
+        // The zero point's type must be the same as the type of the input it belongs to.
+        if (input.quant_param->zero_point->Type() != input.node_arg.Type()) {
+          LOGS_DEFAULT(ERROR) << "Invalid input type because the data type mismatch with its' quant param type.";
+          return false;
+        }
+      }
+    }
+  }
+  // For outputs only the quantization parameters are validated here; the output
+  // tensors' own shapes are not checked.
+  for (const auto& output : node_unit.Outputs()) {
+    if (output.quant_param.has_value()) {
+      if (!has_supported_shape(output.quant_param->scale, node_unit.Name(), node_unit.OpType()))
+        return false;
+
+      if (!has_initialized_quant_param(output.quant_param->scale, initializers))
+        return false;
+      // zero point is optional
+      if (output.quant_param->zero_point) {
+        if (!has_supported_shape(*output.quant_param->zero_point, node_unit.Name(), node_unit.OpType()))
+          return false;
+        if (!has_initialized_quant_param(*output.quant_param->zero_point, initializers))
+          return false;
+      }
+    }
+  }
+  return HasSupportedInputOutputsImpl(initializers, node_unit);
+}
+
+// Default data-type check for a node unit's inputs and outputs.
+// Returns false (after logging) when any existing input/output has no type
+// information, is int64, or is rejected by util::IsTypeSupported().
+bool BaseOpBuilder::HasSupportedInputOutputsImpl(
+    const InitializedTensorSet& /* initializers */, const NodeUnit& node_unit) const {
+  // Check input/output data type, int64 is generally unsupported
+  // specific op builder can override this if the int64 input corresponds to VSINPU param
+  for (const auto& input : node_unit.Inputs()) {
+    // Skip absent inputs, as HasSupportedInputOutputs() does; there is nothing to type-check.
+    if (!input.node_arg.Exists()) {
+      continue;
+    }
+    auto input_type = input.node_arg.Type();
+    // Type() yields a pointer; guard it before dereferencing below.
+    if (input_type == nullptr) {
+      LOGS_DEFAULT(WARNING) << node_unit.OpType() << " has input without type information : "
+                            << input.node_arg.Name();
+      return false;
+    }
+    if (*input_type == "tensor(int64)" || !util::IsTypeSupported(&input.node_arg)) {
+      LOGS_DEFAULT(WARNING) << node_unit.OpType() << " has unsupported input type : "
+                            << *input_type;
+      return false;
+    }
+  }
+  for (const auto& output : node_unit.Outputs()) {
+    if (!output.node_arg.Exists()) {
+      continue;
+    }
+    auto output_type = output.node_arg.Type();
+    if (output_type == nullptr) {
+      LOGS_DEFAULT(WARNING) << node_unit.OpType() << " has output without type information : "
+                            << output.node_arg.Name();
+      return false;
+    }
+    if (*output_type == "tensor(int64)" || !util::IsTypeSupported(&output.node_arg)) {
+      LOGS_DEFAULT(WARNING) << node_unit.OpType() << " has unsupported output type : "
+                            << *output_type;
+      return false;
+    }
+  }
+  return true;
+}
+
+// Checks that the node unit's since-version lies within the inclusive range
+// [GetMinSupportedOpSet, GetMaxSupportedOpSet]; logs the range when it does not.
+bool BaseOpBuilder::HasSupportedOpSet(const NodeUnit& node_unit) const {
+  const auto opset = node_unit.SinceVersion();
+  const bool in_range = opset >= GetMinSupportedOpSet(node_unit) &&
+                        opset <= GetMaxSupportedOpSet(node_unit);
+  if (in_range) {
+    return true;
+  }
+
+  LOGS_DEFAULT(VERBOSE) << node_unit.OpType() << " opset [" << opset
+                        << "] is only supported for opset ["
+                        << GetMinSupportedOpSet(node_unit) << ", "
+                        << GetMaxSupportedOpSet(node_unit) << "]";
+  return false;
+}
+
+// Maps every input and output of the node unit to a TIM-VX tensor and hands the
+// tensors to HandleBuildOp(), whose result is returned.
+//
+// Tensor attributes are chosen as follows:
+//   inputs : CONSTANT for constant initializers, INPUT when the name is one of the
+//            EP graph inputs, TRANSIENT otherwise;
+//   outputs: OUTPUT when the name is one of the EP graph outputs, TRANSIENT otherwise.
+bool BaseOpBuilder::BuildOp(vsi::npu::GraphEP* graph_ep,
+                            const onnxruntime::GraphViewer& graph_viewer,
+                            const NodeUnit& node_unit) {
+  // Bind by reference: the IO definitions are only read, so copying the vectors
+  // (and each element per iteration) is unnecessary.
+  const auto& input_defs = node_unit.Inputs();
+  const auto& output_defs = node_unit.Outputs();
+
+  std::vector<std::shared_ptr<tim::vx::Tensor>> inputs;
+  inputs.reserve(input_defs.size());
+  for (const auto& input_def : input_defs) {
+    // Fetch the container once per iteration so begin()/end() come from the same object.
+    const auto& graph_inputs = graph_ep->GetGraphInputs();
+    auto it = std::find_if(
+        graph_inputs.begin(), graph_inputs.end(),
+        [&input_def](const std::shared_ptr<GraphIOInfo>& info) {
+          return info->name == input_def.node_arg.Name();
+        });
+    tim::vx::TensorAttribute attr;
+    if (graph_viewer.IsConstantInitializer(input_def.node_arg.Name(), true)) {
+      attr = tim::vx::TensorAttribute::CONSTANT;
+    } else if (it == graph_inputs.end()) {
+      attr = tim::vx::TensorAttribute::TRANSIENT;
+    } else {
+      attr = tim::vx::TensorAttribute::INPUT;
+    }
+
+    auto tensor = graph_ep->MapTIMVXTensor(graph_ep->GetGraph(), input_def, node_unit,
+                                           &graph_viewer, attr);
+    inputs.push_back(tensor);
+  }
+
+  std::vector<std::shared_ptr<tim::vx::Tensor>> outputs;
+  outputs.reserve(output_defs.size());
+  for (const auto& output_def : output_defs) {
+    const auto& graph_outputs = graph_ep->GetGraphOutputs();
+    auto it = std::find_if(
+        graph_outputs.begin(), graph_outputs.end(),
+        [&output_def](const std::shared_ptr<GraphIOInfo>& info) {
+          return info->name == output_def.node_arg.Name();
+        });
+    tim::vx::TensorAttribute attribute =
+        it == graph_outputs.end()
+            ? tim::vx::TensorAttribute::TRANSIENT
+            : tim::vx::TensorAttribute::OUTPUT;
+    auto tensor = graph_ep->MapTIMVXTensor(graph_ep->GetGraph(), output_def, node_unit,
+                                           &graph_viewer, attribute);
+    outputs.push_back(tensor);
+  }
+  return HandleBuildOp(graph_ep, inputs, outputs, node_unit);
+}
+}  // namespace npu
+
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/builders/impl/base_op_builder.h
+++ b/onnxruntime/core/providers/vsinpu/builders/impl/base_op_builder.h
@ -0,0 +1,75 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2023 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#pragma once
+#include <memory>
+#include <vector>
+#include "core/providers/vsinpu/builders/op_builder.h"
+#include "core/providers/vsinpu/vsinpu_ep_graph.h"
+#include "core/providers/vsinpu/vsinpu_util.h"
+#include "tim/vx/operation.h"
+#include "tim/vx/ops.h"
+
+namespace onnxruntime {
+namespace vsi {
+namespace npu {
+// Common base for the VSINPU op builders. It implements the generic support
+// checks and the tensor mapping, and exposes virtual hooks with permissive
+// defaults that concrete builders override.
+class BaseOpBuilder : public IOpBuilder {
+ public:
+  virtual ~BaseOpBuilder() = default;
+
+  // Runs the opset check, the input/output validation and IsOpSupported().
+  bool IsSupported(const onnxruntime::GraphViewer& graph_viewer,
+                   const NodeUnit& node_unit) const override;
+  // Maps the node unit's inputs/outputs to TIM-VX tensors and calls HandleBuildOp().
+  bool BuildOp(vsi::npu::GraphEP* graph_ep,
+               const onnxruntime::GraphViewer& graph_viewer, const NodeUnit& node_unit);
+  // Builder-specific support check; the default accepts every node.
+  virtual bool IsOpSupported(const onnxruntime::GraphViewer& graph_viewer,
+                             const Node* node) const {
+    return true;
+  }
+
+  // Default: the builder does not handle a quantized op.
+  virtual bool IsQuantizedOp(const NodeUnit& /* node_unit */) const { return false; }
+
+  // Inclusive opset range accepted by HasSupportedOpSet(); defaults to [1, 22].
+  virtual int GetMinSupportedOpSet(const NodeUnit& /* node_unit */) const { return 1; }
+  virtual int GetMaxSupportedOpSet(const NodeUnit& /* node_unit */) const { return 22; }
+
+  // Data-type check for inputs/outputs; the default rejects int64 and any type
+  // util::IsTypeSupported() does not accept.
+  virtual bool HasSupportedInputOutputsImpl(
+      const InitializedTensorSet& initializers, const NodeUnit& node_unit) const;
+
+  // TODO(cfy): Check if this node_unit's type is supported
+  virtual bool IsNodeUnitTypeSupported(const NodeUnit& node_unit) const { return true; }
+
+  // Creates the TIM-VX operation(s) for the node unit from the already mapped
+  // tensors. The default does nothing and reports success.
+  virtual bool HandleBuildOp(
+      vsi::npu::GraphEP* graph_ep,
+      std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+      std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+      const NodeUnit& node_unit) {
+    return true;
+  }
+
+ private:
+  // True when the node unit's since-version lies within the supported opset range.
+  bool HasSupportedOpSet(const NodeUnit& node_unit) const;
+  // Validates shapes and quantization parameters, then calls HasSupportedInputOutputsImpl().
+  bool HasSupportedInputOutputs(const InitializedTensorSet& initializers,
+                                const NodeUnit& node_unit) const;
+};
+}  // namespace npu
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/builders/impl/cast_op_builder.h
+++ b/onnxruntime/core/providers/vsinpu/builders/impl/cast_op_builder.h
@ -0,0 +1,47 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2024 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#include <memory>
+#include <vector>
+#include <utility>
+#include "core/providers/shared/utils/utils.h"
+#include "core/providers/vsinpu/builders/impl/base_op_builder.h"
+namespace onnxruntime {
+namespace vsi {
+namespace npu {
+// Emits a TIM-VX DataConvert operation for a Cast node.
+class CastOpBuilder : public BaseOpBuilder {
+ protected:
+  // Creates the DataConvert op, binds the first input and all outputs, and records
+  // the op in the EP's op list. Always reports success.
+  // NOTE(review): no node attribute is read here; presumably the target type comes
+  // from the already mapped output tensor - confirm.
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep, std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs, const NodeUnit& node_unit) override {
+    LOGS_DEFAULT(VERBOSE) << "Creating Cast Op.";
+    auto op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::DataConvert>();
+    (*op).BindInput(inputs[0]).BindOutputs(outputs);
+    graph_ep->GetOps().push_back(std::move(op));
+    return true;
+  }
+};
+
+}  // namespace npu
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/builders/impl/clip_op_builder.cc
+++ b/onnxruntime/core/providers/vsinpu/builders/impl/clip_op_builder.cc
@ -0,0 +1,115 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2024 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#include <limits>
+#include <utility>
+#include "core/providers/vsinpu/builders/impl/clip_op_builder.h"
+
+namespace onnxruntime {
+namespace vsi {
+namespace npu {
+
+namespace clip_internal {
+// Exposes the two extreme values of T as reported by std::numeric_limits<T>:
+// low() forwards to lowest() and max() forwards to max().
+template <typename T>
+struct LowMax {
+  static constexpr T low() { return std::numeric_limits<T>::lowest(); }
+
+  static constexpr T max() { return std::numeric_limits<T>::max(); }
+};
+}  // namespace clip_internal
+
+// Builds a TIM-VX Clip op for element type T (ONNX Clip with opset > 6, where
+// the bounds arrive as optional input tensors instead of attributes).
+// The constructor does all the work: it resolves the bounds, creates the op,
+// binds the tensors and records the op in graph_ep->GetOps().
+template <typename T>
+struct ClipOpBuilder::ClipImpl {
+  ClipImpl(vsi::npu::GraphEP* graph_ep, std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+           std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs) {
+    // Start from the full range reported for T; a bound is only overwritten
+    // when the corresponding input tensor is present.
+    T lower_bound = clip_internal::LowMax<T>::low();
+    T upper_bound = clip_internal::LowMax<T>::max();
+
+    // inputs[1] carries min and inputs[2] carries max, when supplied.
+    const bool has_min_slot = inputs.size() > 1;
+    const bool has_max_slot = inputs.size() > 2;
+    if (has_min_slot && inputs[1]) {
+      inputs[1]->CopyDataFromTensor(&lower_bound);
+    }
+    if (has_max_slot && inputs[2]) {
+      inputs[2]->CopyDataFromTensor(&upper_bound);
+    }
+
+    // The TIM-VX Clip op is constructed from float bounds whatever T is.
+    const float clip_min = static_cast<float>(lower_bound);
+    const float clip_max = static_cast<float>(upper_bound);
+    auto op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Clip>(clip_min, clip_max);
+    (*op).BindInputs(inputs).BindOutputs(outputs);
+    graph_ep->GetOps().push_back(std::move(op));
+  }
+};
+
+// Creates the TIM-VX Clip op for an ONNX Clip node.
+//  - opset <= 6: the bounds are the "min"/"max" attributes; when an attribute
+//    is absent the bound falls back to the lowest / largest finite float, so
+//    no representable value is clipped.
+//  - opset  > 6: the bounds are read from the optional input tensors by
+//    ClipImpl<T>, with T selected from the data type of inputs[0]. Data types
+//    without a dedicated case are handled as float.
+// Always returns true.
+bool ClipOpBuilder::HandleBuildOp(vsi::npu::GraphEP* graph_ep,
+                                  std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                                  std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                                  const NodeUnit& node_unit) {
+  LOGS_DEFAULT(INFO) << "Creating Clip Op.";
+  if (node_unit.SinceVersion() <= 6) {
+    NodeAttrHelper helper(node_unit.GetNode());
+    // Use the exact float limits rather than a truncated literal (3.402e+38f),
+    // which would clip values between that literal and FLT_MAX.
+    auto min = helper.Get("min", std::numeric_limits<float>::lowest());
+    auto max = helper.Get("max", std::numeric_limits<float>::max());
+    auto op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Clip>(min, max);
+    (*op).BindInputs(inputs).BindOutputs(outputs);
+    graph_ep->GetOps().push_back(std::move(op));
+  } else {
+    switch (inputs[0]->GetDataType()) {
+      case tim::vx::DataType::INT8:
+        ClipImpl<int8_t>(graph_ep, inputs, outputs);
+        break;
+      case tim::vx::DataType::UINT8:
+        ClipImpl<uint8_t>(graph_ep, inputs, outputs);
+        break;
+      case tim::vx::DataType::INT16:
+        ClipImpl<int16_t>(graph_ep, inputs, outputs);
+        break;
+      case tim::vx::DataType::INT32:
+        ClipImpl<int32_t>(graph_ep, inputs, outputs);
+        break;
+      case tim::vx::DataType::FLOAT16:
+        ClipImpl<Ort::Float16_t>(graph_ep, inputs, outputs);
+        break;
+      case tim::vx::DataType::FLOAT32:
+      default:
+        ClipImpl<float>(graph_ep, inputs, outputs);
+        break;
+    }
+  }
+  return true;
+}
+
+}  // namespace npu
+
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/builders/impl/clip_op_builder.h
+++ b/onnxruntime/core/providers/vsinpu/builders/impl/clip_op_builder.h
@ -0,0 +1,57 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2024 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#include <memory>
+#include <vector>
+#include "core/providers/vsinpu/builders/impl/base_op_builder.h"
+#include "core/providers/shared/utils/utils.h"
+
+namespace onnxruntime {
+namespace vsi {
+namespace npu {
+// Op builder for ONNX Clip. The bounds must be known at graph-build time:
+// either as attributes (opset <= 6) or as constant initializer inputs.
+class ClipOpBuilder final : public BaseOpBuilder {
+  // For opset > 6 the min/max bounds are inputs 1 and 2. The build step reads
+  // their values while constructing the op, so both must be initializers.
+  // Returns false (with a warning) when a supplied bound is not constant.
+  bool IsOpSupported(const onnxruntime::GraphViewer& graph_viewer,
+                     const Node* node) const override {
+    if (node->SinceVersion() > 6) {
+      const auto input_defs = node->InputDefs();
+      const auto& initializers = graph_viewer.GetAllInitializedTensors();
+      for (size_t i = 1; i < input_defs.size(); ++i) {
+        // An omitted trailing "max" is fine: the build step then keeps the
+        // default upper bound. "min" (index 1) is always required to be an
+        // initializer once the slot is present.
+        if (i > 1 && !input_defs[i]->Exists()) {
+          continue;
+        }
+        if (!Contains(initializers, input_defs[i]->Name())) {
+          LOGS_DEFAULT(WARNING) << "Min/Max value must be const input or attribute.";
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+
+  // Creates the TIM-VX Clip op; defined in clip_op_builder.cc.
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                     const NodeUnit& node_unit) override;
+
+ private:
+  // Per-element-type implementation used for opset > 6.
+  template <typename T>
+  struct ClipImpl;
+};
+}  // namespace npu
+
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/builders/impl/concat_op_builder.h
+++ b/onnxruntime/core/providers/vsinpu/builders/impl/concat_op_builder.h
@ -0,0 +1,65 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2024 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#include <memory>
+#include <vector>
+#include <utility>
+#include "core/providers/vsinpu/builders/impl/base_op_builder.h"
+#include "core/providers/common.h"
+#include "core/providers/shared/utils/utils.h"
+
+namespace onnxruntime {
+namespace vsi {
+namespace npu {
+// Lowers an ONNX Concat node to a TIM-VX Concat operation.
+class ConcatOpBuilder : public BaseOpBuilder {
+  // Accepts the node only when the "axis" attribute (0 when absent) lies in
+  // [-rank, rank), with rank taken from the shape of the first input.
+  bool IsOpSupported(const onnxruntime::GraphViewer& graph_viewer,
+                     const Node* node) const override {
+    NodeAttrHelper attrs(*node);
+    auto concat_axis = attrs.Get("axis", 0);
+    auto defs = node->InputDefs();
+    auto first_shape = vsi::npu::util::GetTensorShape(*defs[0]);
+    int32_t rank = first_shape.NumDimensions();
+    const bool axis_in_range = concat_axis < rank && concat_axis >= -rank;
+    if (!axis_in_range) {
+      LOGS_DEFAULT(ERROR) << "Axis is invalid in Concat.";
+      return false;
+    }
+    return true;
+  }
+
+  // Maps the ONNX axis through util::ReverseAxis using the rank of the first
+  // input tensor, creates the Concat op for inputs.size() inputs, binds all
+  // tensors and records the op in graph_ep->GetOps(). Always returns true.
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                     const NodeUnit& node_unit) override {
+    LOGS_DEFAULT(VERBOSE) << "Creating Concat Op.";
+    NodeAttrHelper attrs(node_unit.GetNode());
+    auto concat_axis = attrs.Get("axis", 0);
+    concat_axis = util::ReverseAxis(concat_axis, inputs[0]->GetShape().size());
+    auto concat_op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Concat>(
+        static_cast<uint32_t>(concat_axis), inputs.size());
+    (*concat_op).BindInputs(inputs).BindOutputs(outputs);
+    graph_ep->GetOps().push_back(std::move(concat_op));
+    return true;
+  }
+};
+}  // namespace npu
+
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/builders/impl/conv_op_builder.h
+++ b/onnxruntime/core/providers/vsinpu/builders/impl/conv_op_builder.h
@ -0,0 +1,162 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2023 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#include <string>
+#include <memory>
+#include <vector>
+#include <utility>
+#include "core/providers/shared/utils/utils.h"
+#include "core/providers/vsinpu/builders/impl/base_op_builder.h"
+namespace onnxruntime {
+namespace vsi {
+namespace npu {
+// Lowers ONNX Conv nodes (1-D and 2-D) to the matching TIM-VX operation:
+// Conv1d / Conv2d for regular and depthwise convolutions, GroupedConv1d /
+// GroupedConv2d for every other grouped convolution.
+class ConvOpBuilder : public BaseOpBuilder {
+  // Rejects nodes whose first input has rank 5 (3-D convolution); every other
+  // node passes this check.
+  bool IsOpSupported(const onnxruntime::GraphViewer& graph_viewer,
+                     const Node* node) const override {
+    auto input_defs = node->InputDefs();
+    auto shape = vsi::npu::util::GetTensorShape(*input_defs[0]);
+    if (shape.NumDimensions() == 5) {
+      LOGS_DEFAULT(WARNING) << "Not support conv3d yet.";
+      return false;
+    }
+    return true;
+  }
+
+  // Creates the convolution op, binds all inputs/outputs and records the op
+  // in graph_ep->GetOps().
+  //  - inputs[0] is the data tensor, inputs[1] the weight tensor; a weight of
+  //    rank 3 selects the 1-D variants, anything else the 2-D variants.
+  //  - When "auto_pad" is anything other than "NOTSET" the padding type is
+  //    handed to TIM-VX; otherwise the explicit "pads" attribute is used
+  //    (all zeros when absent).
+  // Returns false when an explicit "pads" attribute has too few values,
+  // true otherwise.
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                     const NodeUnit& node_unit) override {
+    auto input_tensor = inputs[0];
+    auto weight_tensor = inputs[1];
+    // The weight layouts used below (WIcOc / WHIcOc) keep the output channel
+    // in the last dimension.
+    auto OChannel_idx = weight_tensor->GetShape().size() - 1;
+    const bool is_1d_conv = weight_tensor->GetShape().size() == 3;
+    NodeAttrHelper helper(node_unit.GetNode());
+    // ONNX defines "NOTSET" as the default for auto_pad. Falling back to any
+    // other value would route nodes without the attribute into the auto-pad
+    // branch, where their explicit "pads" are never read.
+    auto padtype = helper.Get("auto_pad", std::string("NOTSET"));
+    auto group = helper.Get("group", static_cast<uint32_t>(1));
+
+    // Depthwise: one group per output channel. Any other group count != 1 is
+    // a generic grouped convolution.
+    const bool is_depthwise = group != 1 && group == weight_tensor->GetShape()[OChannel_idx];
+    const bool is_grouped = group != 1 && !is_depthwise;
+
+    std::string op_type = is_depthwise ? "DepthwiseConv"
+                          : is_grouped ? "GroupConv"
+                                       : "Conv";
+    op_type += is_1d_conv ? "1D" : "2D";
+    std::string op_name = std::string("Creating ") + op_type + " Op";
+    LOGS_DEFAULT(INFO) << op_name;
+
+    uint32_t default_uint = 1;
+    std::vector<uint32_t> default_vec = {1, 1};
+
+    auto stride =
+        helper.Get("strides", is_1d_conv ? std::vector<uint32_t>{default_uint}
+                                         : default_vec);
+    auto dilation =
+        helper.Get("dilations", is_1d_conv ? std::vector<uint32_t>{default_uint}
+                                           : default_vec);
+
+    std::shared_ptr<tim::vx::Operation> op;
+    if (padtype != "NOTSET") {  // array "pads" is not set
+      if (is_grouped) {
+        if (is_1d_conv) {
+          op = graph_ep->GetGraph()
+                   ->CreateOperation<tim::vx::ops::GroupedConv1d>(
+                       vsi::npu::util::GetPadType(padtype), stride[0],
+                       dilation[0], group, tim::vx::DataLayout::WCN,
+                       tim::vx::DataLayout::WIcOc);
+        } else {
+          op = graph_ep->GetGraph()
+                   ->CreateOperation<tim::vx::ops::GroupedConv2d>(
+                       vsi::npu::util::GetPadType(padtype),
+                       /* W_stride, H_stride*/
+                       std::array<uint32_t, 2>{stride[1], stride[0]},
+                       /* W_dilation, H_dilation*/
+                       std::array<uint32_t, 2>{dilation[1], dilation[0]}, group,
+                       tim::vx::DataLayout::WHCN, tim::vx::DataLayout::WHIcOc);
+        }
+      } else {
+        // 0 selects a regular convolution; for depthwise it is the ratio of
+        // output channels to input channels.
+        int32_t multiplier = group == 1
+                                 ? 0
+                                 : weight_tensor->GetShape()[OChannel_idx] / input_tensor->GetShape()[OChannel_idx - 1];
+        if (is_1d_conv) {
+          op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Conv1d>(
+              vsi::npu::util::GetPadType(padtype), stride[0], dilation[0], multiplier,
+              tim::vx::DataLayout::WCN, tim::vx::DataLayout::WIcOc);
+        } else {
+          op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Conv2d>(
+              vsi::npu::util::GetPadType(padtype),
+              /* W_stride, H_stride*/
+              std::array<uint32_t, 2>{stride[1], stride[0]},
+              /* W_dilation, H_dilation*/
+              std::array<uint32_t, 2>{dilation[1], dilation[0]}, multiplier,
+              tim::vx::DataLayout::WHCN, tim::vx::DataLayout::WHIcOc);
+        }
+      }
+    } else {
+      // ONNX stores pads as [x1_begin, x2_begin, ..., x1_end, x2_end]: two
+      // values for 1-D, four for 2-D. The default must have that many entries
+      // because the 2-D paths below index pads[0..3].
+      const size_t pads_count = is_1d_conv ? 2 : 4;
+      auto pads = helper.Get("pads", std::vector<uint32_t>(pads_count, 0U));
+      if (pads.size() < pads_count) {
+        LOGS_DEFAULT(ERROR) << "Conv pads attribute has fewer values than expected.";
+        return false;
+      }
+      if (is_grouped) {
+        if (is_1d_conv) {
+          op = graph_ep->GetGraph()
+                   ->CreateOperation<tim::vx::ops::GroupedConv1d>(
+                       vsi::npu::util::GetPadType(padtype),
+                       std::array<uint32_t, 2>{pads[0], pads[1]}, stride[0],
+                       dilation[0], group, tim::vx::DataLayout::WCN,
+                       tim::vx::DataLayout::WIcOc);
+        } else {
+          op = graph_ep->GetGraph()
+                   ->CreateOperation<tim::vx::ops::GroupedConv2d>(
+                       /* W_begin,W_end, H_begin,H_end*/ std::array<
+                           uint32_t, 4>{pads[1], pads[3], pads[0], pads[2]},
+                       /* W_stride, H_stride*/
+                       std::array<uint32_t, 2>{stride[1], stride[0]},
+                       /* W_dilation, H_dilation*/
+                       std::array<uint32_t, 2>{dilation[1], dilation[0]}, group,
+                       tim::vx::DataLayout::WHCN, tim::vx::DataLayout::WHIcOc);
+        }
+      } else {
+        // 0 selects a regular convolution; for depthwise it is the ratio of
+        // output channels to input channels.
+        int32_t multiplier = group == 1
+                                 ? 0
+                                 : weight_tensor->GetShape()[OChannel_idx] / input_tensor->GetShape()[OChannel_idx - 1];
+        if (is_1d_conv) {
+          op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Conv1d>(
+              std::array<uint32_t, 2>{pads[0], pads[1]}, stride[0], dilation[0],
+              multiplier, tim::vx::DataLayout::WCN, tim::vx::DataLayout::WIcOc);
+        } else {
+          op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Conv2d>(
+              /* W_begin,W_end, H_begin,H_end*/ std::array<uint32_t, 4>{pads[1], pads[3],
+                                                                        pads[0], pads[2]},
+              /* W_stride, H_stride*/
+              std::array<uint32_t, 2>{stride[1], stride[0]},
+              /* W_dilation, H_dilation*/
+              std::array<uint32_t, 2>{dilation[1], dilation[0]}, multiplier,
+              tim::vx::DataLayout::WHCN, tim::vx::DataLayout::WHIcOc);
+        }
+      }
+    }
+    (*op).BindInputs(inputs).BindOutputs(outputs);
+    graph_ep->GetOps().push_back(std::move(op));
+    return true;
+  }
+};
+}  // namespace npu
+
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/builders/impl/dequantize_op_builder.h
+++ b/onnxruntime/core/providers/vsinpu/builders/impl/dequantize_op_builder.h
@ -0,0 +1,83 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2024 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#include <memory>
+#include <vector>
+#include <utility>
+#include "core/providers/vsinpu/builders/impl/base_op_builder.h"
+#include "core/providers/common.h"
+#include "core/providers/shared/utils/utils.h"
+
+namespace onnxruntime {
+namespace vsi {
+namespace npu {
+// Lowers ONNX DequantizeLinear to a TIM-VX DataConvert operation. Only
+// per-tensor quantization with known quantization parameters is accepted.
+class DequantizeLinearOpBuilder : public BaseOpBuilder {
+  // Positions of the DequantizeLinear inputs.
+  enum DequantizeINPUTS {
+    input_tensor = 0,
+    scale_tensor = 1,
+    zero_point_tensor = 2
+  };
+
+  // Checks the first input of the node unit. Returns false (with a warning)
+  // when:
+  //  - its type is missing, is int64, or fails util::IsTypeSupported;
+  //  - it carries no quantization parameters;
+  //  - the shape of the scale is unknown;
+  //  - the scale is neither a scalar nor has a first dimension of 1
+  //    (treated as per-channel quantization).
+  bool HasSupportedInputOutputsImpl(const InitializedTensorSet& initializers,
+                                    const NodeUnit& node_unit) const override {
+    const auto& input = node_unit.Inputs()[0];
+    auto input_type = input.node_arg.Type();
+    // Type() yields a pointer; guard it before dereferencing.
+    if (input_type == nullptr) {
+      LOGS_DEFAULT(WARNING) << node_unit.OpType() << " has an input without type information.";
+      return false;
+    }
+    if (*input_type == "tensor(int64)" || !util::IsTypeSupported(&input.node_arg)) {
+      LOGS_DEFAULT(WARNING) << node_unit.OpType() << " has unsupported input type : "
+                            << *input_type;
+      return false;
+    }
+    if (!input.quant_param.has_value()) {
+      LOGS_DEFAULT(WARNING) << "The quantization params must be known.";
+      return false;
+    }
+    // Shape() yields a pointer as well; without a shape the per-tensor
+    // requirement cannot be verified, so the node is reported as unsupported.
+    const auto* scale_shape = input.quant_param->scale.Shape();
+    if (scale_shape == nullptr) {
+      LOGS_DEFAULT(WARNING) << "The shape of the quantization scale must be known.";
+      return false;
+    }
+    if (scale_shape->dim_size() != 0 &&
+        scale_shape->dim(0).dim_value() != 1) {
+      LOGS_DEFAULT(WARNING) << "Per channel quantized input is not support in DequantizeLinear op.";
+      return false;
+    }
+    return true;
+  }
+
+  // Rejects nodes whose "block_size" attribute is present and non-zero
+  // (blocked quantization).
+  bool IsOpSupported(const onnxruntime::GraphViewer& graph_viewer,
+                     const Node* node) const override {
+    NodeAttrHelper helper(*node);
+    if (helper.HasAttr("block_size") && helper.Get("block_size", 0) != 0) {
+      LOGS_DEFAULT(WARNING) << "Not support block quantization yet.";
+      return false;
+    }
+    return true;
+  }
+
+  // Creates a DataConvert op, binds all inputs/outputs and records the op in
+  // graph_ep->GetOps(). Always returns true.
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                     const NodeUnit& node_unit) override {
+    LOGS_DEFAULT(INFO) << "Creating Dequantize Op.";
+    auto op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::DataConvert>();
+    (*op).BindInputs(inputs).BindOutputs(outputs);
+    graph_ep->GetOps().push_back(std::move(op));
+    return true;
+  }
+};
+}  // namespace npu
+
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/builders/impl/elementwise_op_builder.h
+++ b/onnxruntime/core/providers/vsinpu/builders/impl/elementwise_op_builder.h
@ -0,0 +1,98 @@
+
+/****************************************************************************
+ *
+ *    Copyright (c) 2023 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#include <memory>
+#include <vector>
+#include <utility>
+#include "core/providers/vsinpu/builders/impl/base_op_builder.h"
+namespace onnxruntime {
+namespace vsi {
+namespace npu {
+// Generates a class named <onnx_op_type>OpBuilder deriving from BaseOpBuilder.
+//  - IsOpSupported rejects the node when any input is of type tensor(int64).
+//  - HandleBuildOp creates tim::vx::ops::<vsinpu_op_kind>, binds all inputs
+//    and outputs, and records the op in graph_ep->GetOps() like the other
+//    builders in this directory do.
+// (Comments live above the macro because `//` cannot be used on lines that
+// end in a continuation backslash.)
+#define ELEMENTWISE_OP_BUILDER(onnx_op_type, vsinpu_op_kind)                                     \
+  class onnx_op_type##OpBuilder : public BaseOpBuilder {                                         \
+    bool IsOpSupported(const onnxruntime::GraphViewer& graph_viewer,                             \
+                       const Node* node) const override {                                        \
+      for (auto input : node->InputDefs()) {                                                     \
+        if (*input->Type() == "tensor(int64)") {                                                 \
+          LOGS_DEFAULT(WARNING) << "Int64 type is not suppoted as elementwise operation input."; \
+          return false;                                                                          \
+        }                                                                                        \
+      }                                                                                          \
+      return true;                                                                               \
+    }                                                                                            \
+    bool HandleBuildOp(vsi::npu::GraphEP* graph_ep,                                              \
+                       std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,                    \
+                       std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,                   \
+                       const NodeUnit& node_unit) override {                                     \
+      LOGS_DEFAULT(INFO) << "Creating " << #onnx_op_type << " Op";                               \
+      auto op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::vsinpu_op_kind>();           \
+      (*op).BindInputs(inputs).BindOutputs(outputs);                                             \
+      graph_ep->GetOps().push_back(std::move(op));                                               \
+      return true;                                                                               \
+    }                                                                                            \
+  };
+
+// One builder class per supported elementwise operator. The first argument is
+// the ONNX op name (the generated class is <name>OpBuilder); the second is the
+// tim::vx::ops type it is lowered to. The names differ only for Mul -> Multiply.
+ELEMENTWISE_OP_BUILDER(Add, Add);
+ELEMENTWISE_OP_BUILDER(Sub, Sub);
+ELEMENTWISE_OP_BUILDER(Mul, Multiply);
+ELEMENTWISE_OP_BUILDER(Div, Div);  // not consider zero
+ELEMENTWISE_OP_BUILDER(Abs, Abs);
+ELEMENTWISE_OP_BUILDER(Sqrt, Sqrt);
+ELEMENTWISE_OP_BUILDER(Exp, Exp);
+ELEMENTWISE_OP_BUILDER(Floor, Floor);
+ELEMENTWISE_OP_BUILDER(Log, Log);
+ELEMENTWISE_OP_BUILDER(Sin, Sin);
+ELEMENTWISE_OP_BUILDER(HardSwish, HardSwish);
+
+// Lowers an ONNX Pow node to a TIM-VX Pow operation.
+class PowOpBuilder : public BaseOpBuilder {
+  // Rejects the one unsupported type pairing: one operand float32 and the
+  // other int32, in either order. Every other combination is accepted.
+  bool IsOpSupported(const onnxruntime::GraphViewer& graph_viewer,
+                     const Node* node) const override {
+    const auto base_type = *node->InputDefs()[0]->Type();
+    const auto exponent_type = *node->InputDefs()[1]->Type();
+    const bool float_base_int_exponent =
+        base_type == "tensor(float)" && exponent_type == "tensor(int32)";
+    const bool int_base_float_exponent =
+        base_type == "tensor(int32)" && exponent_type == "tensor(float)";
+    if (float_base_int_exponent || int_base_float_exponent) {
+      LOGS_DEFAULT(WARNING) << "Pow op does not support one of input is float32 while the other one is int32 type.";
+      return false;
+    }
+    return true;
+  }
+
+  // Creates the Pow op, binds all inputs/outputs and records the op in
+  // graph_ep->GetOps(). Always returns true.
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                     const NodeUnit& node_unit) override {
+    LOGS_DEFAULT(INFO) << "Creating Pow Op";
+    auto pow_op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Pow>();
+    (*pow_op).BindInputs(inputs).BindOutputs(outputs);
+    graph_ep->GetOps().push_back(std::move(pow_op));
+    return true;
+  }
+};
+
+}  // namespace npu
+
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/builders/impl/flatten_op_builder.h
+++ b/onnxruntime/core/providers/vsinpu/builders/impl/flatten_op_builder.h
@ -0,0 +1,65 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2024 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#include <memory>
+#include <vector>
+#include <utility>
+#include "core/providers/vsinpu/builders/impl/base_op_builder.h"
+#include "core/providers/common.h"
+#include "core/providers/shared/utils/utils.h"
+
+namespace onnxruntime {
+namespace vsi {
+namespace npu {
+// Lowers an ONNX Flatten node to a TIM-VX Reshape operation.
+class FlattenOpBuilder : public BaseOpBuilder {
+  // Picks the target shape, creates the Reshape op, binds all inputs/outputs
+  // and records the op in graph_ep->GetOps(). Always returns true.
+  //  - When the output tensor already has a 2-D shape, that shape is used.
+  //  - Otherwise the shape is derived from the input tensor and the "axis"
+  //    attribute (1 when absent).
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                     const NodeUnit& node_unit) override {
+    LOGS_DEFAULT(VERBOSE) << "Creating Flatten Op.";
+    std::vector<uint32_t> target_shape;
+    const bool output_is_2d = outputs[0]->GetShape().size() == 2;
+    if (output_is_2d) {
+      target_shape = outputs[0]->GetShape();
+    } else {
+      auto src_shape = inputs[0]->GetShape();
+      NodeAttrHelper attrs(node_unit.GetNode());
+      int64_t axis = attrs.Get("axis", 1);
+      axis = util::ReverseAxis(static_cast<int32_t>(axis), src_shape.size());
+      // NOTE(review): this multiplies the first `axis` dimensions of the
+      // TIM-VX shape; confirm against util::ReverseAxis that this is the
+      // intended split point.
+      uint32_t leading_extent = 1;
+      int64_t dim = 0;
+      while (dim < axis) {
+        leading_extent *= inputs[0]->GetShape()[dim];
+        ++dim;
+      }
+      // The remaining extent is whatever is left of the element count.
+      uint32_t trailing_extent = inputs[0]->GetSpec().GetElementNum() / leading_extent;
+      target_shape.push_back(leading_extent);
+      target_shape.push_back(trailing_extent);
+    }
+    auto reshape_op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Reshape>(target_shape);
+    (*reshape_op).BindInputs(inputs).BindOutputs(outputs);
+    graph_ep->GetOps().push_back(std::move(reshape_op));
+    return true;
+  }
+};
+}  // namespace npu
+
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/builders/impl/gather_op_builder.h
+++ b/onnxruntime/core/providers/vsinpu/builders/impl/gather_op_builder.h
@ -0,0 +1,86 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2024 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#include <memory>
+#include <vector>
+#include <utility>
+#include "core/providers/vsinpu/builders/impl/base_op_builder.h"
+#include "core/providers/shared/utils/utils.h"
+
+namespace onnxruntime {
+namespace vsi {
+namespace npu {
+// Lowers a Gather node to tim::vx::ops::Gather.
+class GatherOpBuilder : public BaseOpBuilder {
+  // Returns true only when both the data input and the indices have supported types:
+  //  - the data input must not be int64;
+  //  - indices must be int32 or int64;
+  //  - int64 indices must be an initializer, because HandleBuildOp converts them to
+  //    int32 while building the graph.
+  bool HasSupportedInputOutputsImpl(const InitializedTensorSet& initializers,
+                                    const NodeUnit& node_unit) const override {
+    auto input = node_unit.Inputs()[0];
+    auto indices = node_unit.Inputs()[1];
+    if (util::IsTypeSupported(&input.node_arg) && util::IsTypeSupported(&indices.node_arg)) {
+      if (*input.node_arg.Type() == "tensor(int64)") {
+        // This branch rejects the *data* input; the message now says so instead of
+        // talking about the indices tensor.
+        LOGS_DEFAULT(WARNING) << "Int64 input tensor is not supported in gather op.";
+        return false;
+      }
+      if (*indices.node_arg.Type() != "tensor(int64)" && *indices.node_arg.Type() != "tensor(int32)") {
+        LOGS_DEFAULT(WARNING) << "Unsupported indices tensor type in gather op.";
+        return false;
+      }
+      if (*indices.node_arg.Type() == "tensor(int64)" && !Contains(initializers, indices.node_arg.Name())) {
+        LOGS_DEFAULT(WARNING) << "Only support const attribute if indice tensor is in int64 type.";
+        return false;
+      }
+      return true;
+    }
+    return false;
+  }
+
+  // Creates the Gather op. int32 indices are bound directly; int64 indices are read back
+  // from the tensor, narrowed to int32 and bound through a newly created tensor.
+  // Always returns true.
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                     const NodeUnit& node_unit) override {
+    LOGS_DEFAULT(VERBOSE) << "Creating Gather Op.";
+    NodeAttrHelper helper(node_unit.GetNode());
+    auto axis = helper.Get("axis", 0);
+    axis = util::ReverseAxis(axis, inputs[0]->GetShape().size());
+    auto op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Gather>(axis, 0);
+
+    bool is_i64_indices = inputs[1]->GetDataType() == tim::vx::DataType::INT64;
+    if (!is_i64_indices) {
+      (*op).BindInputs(inputs).BindOutputs(outputs);
+    } else {
+      // Narrow the int64 index values element-wise to int32.
+      std::vector<int64_t> origin_data(inputs[1]->GetSpec().GetElementNum());
+      inputs[1]->CopyDataFromTensor(origin_data.data());
+      std::vector<int32_t> transformed_data(origin_data.begin(), origin_data.end());
+      tim::vx::TensorSpec ts = inputs[1]->GetSpec().SetAttribute(tim::vx::TensorAttribute::INPUT);
+      ts.SetDataType(tim::vx::DataType::INT32);
+      auto transformed_indices = graph_ep->GetGraph()->CreateTensor(ts, transformed_data.data());
+      (*op).BindInput(inputs[0]).BindInput(transformed_indices).BindOutput(outputs[0]);
+    }
+    graph_ep->GetOps().push_back(std::move(op));
+    return true;
+  }
+};
+
+}  // namespace npu
+
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/builders/impl/gemm_op_builder.h
+++ b/onnxruntime/core/providers/vsinpu/builders/impl/gemm_op_builder.h
@ -0,0 +1,148 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2023 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#include <memory>
+#include <vector>
+#include <utility>
+#include "core/providers/shared/utils/utils.h"
+#include "core/providers/vsinpu/builders/impl/base_op_builder.h"
+namespace onnxruntime {
+namespace vsi {
+namespace npu {
+// Lowers a Gemm node to a TIM-VX FullyConnected op, inserting Multiply ops for
+// alpha / beta scaling and Transpose ops where needed.
+class GemmOpBuilder : public BaseOpBuilder {
+  // Rejects bias shapes that would require broadcasting, and float16 bias tensors that
+  // are not constant initializers.
+  bool IsOpSupported(const onnxruntime::GraphViewer& graph_viewer,
+                     const Node* node) const override {
+    auto input_defs = node->InputDefs();
+    NodeAttrHelper helper(*node);
+    auto weight_units = helper.Get("transB", 0) == 1
+                            ? vsi::npu::util::GetTensorShape(*input_defs[1]).GetDims()[0]
+                            : vsi::npu::util::GetTensorShape(*input_defs[1]).GetDims()[1];
+    if (input_defs.size() > 2) {
+      auto bias_shape = vsi::npu::util::GetTensorShape(*input_defs[2]);
+      if (bias_shape.NumDimensions() == 1 && bias_shape.GetDims()[0] != weight_units) {
+        LOGS_DEFAULT(WARNING) << "Not support to broadcast bias shape.";
+        return false;
+      } else if (bias_shape.NumDimensions() == 2 &&
+                 (bias_shape.Size() != weight_units ||
+                  (bias_shape.GetDims()[0] != 1 && bias_shape.GetDims()[1] != 1))) {
+        LOGS_DEFAULT(WARNING) << "Not support 2-dims bias shape.";
+        return false;
+      }
+
+      if (*input_defs[2]->Type() == "tensor(float16)" &&
+          !graph_viewer.IsConstantInitializer(input_defs[2]->Name(), true)) {
+        LOGS_DEFAULT(WARNING) << "Not support f16 bias with input attr.";
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Builds the op sequence for Gemm:
+  //   1. optional Multiply of A by alpha,
+  //   2. optional Multiply of C by beta (only when a C input is present),
+  //   3. optional Transpose of A (transA set) and of B (transB not set),
+  //   4. FullyConnected over {A, B[, C]} writing outputs[0].
+  // Always returns true.
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                     const NodeUnit& node_unit) override {
+    LOGS_DEFAULT(VERBOSE) << "Creating Gemm Op.";
+    auto input_A = inputs[0];
+    auto input_B = inputs[1];
+    NodeAttrHelper helper(node_unit.GetNode());
+
+    auto trans_A = helper.Get("transA", 0);
+    auto trans_B = helper.Get("transB", 0);
+    const bool has_alpha = (helper.Get("alpha", 1.0f) != 1.0);
+    const bool has_beta = (helper.Get("beta", 1.0f) != 1.0);
+    const bool has_C = (inputs.size() == 3);
+    auto weight_units = helper.Get("transB", 0) == 1 ? inputs[1]->GetShape()[1] : inputs[1]->GetShape()[0];
+
+    // Single-element float constant spec used for the alpha / beta coefficients.
+    tim::vx::TensorSpec coef_spec(tim::vx::DataType::FLOAT32, {1},
+                                  tim::vx::TensorAttribute::CONSTANT);
+
+    // output = input * coef
+    auto multiply_impl = [&](std::shared_ptr<tim::vx::Tensor> input,
+                             std::shared_ptr<tim::vx::Tensor> coef,
+                             std::shared_ptr<tim::vx::Tensor> output) {
+      auto multiply_op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Multiply>();
+      (*multiply_op).BindInput(input).BindInput(coef).BindOutput(output);
+      graph_ep->GetOps().push_back(multiply_op);
+    };
+
+    // output = transpose(input) with permutation {1, 0}
+    auto transpose_impl = [&](std::shared_ptr<tim::vx::Tensor> input,
+                              std::shared_ptr<tim::vx::Tensor> output) {
+      std::vector<uint32_t> perm = {1U, 0U};
+      auto transpose_op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Transpose>(perm);
+      (*transpose_op).BindInput(input).BindOutput(output);
+      graph_ep->GetOps().push_back(std::move(transpose_op));
+    };
+
+    auto fc_impl = [&](std::vector<std::shared_ptr<tim::vx::Tensor>> inputs,
+                       std::shared_ptr<tim::vx::Tensor> output) {
+      auto fc_op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::FullyConnected>(0, weight_units);
+      (*fc_op).BindInputs(inputs).BindOutput(output);
+      graph_ep->GetOps().push_back(std::move(fc_op));
+    };
+
+    auto alpha_A = input_A;
+    std::shared_ptr<tim::vx::Tensor> beta_C;
+    auto final_A = input_A;
+    auto final_B = input_B;
+
+    if (has_alpha) {
+      auto alpha_tensor = graph_ep->GetGraph()->CreateTensor(coef_spec);
+      auto alpha = helper.Get("alpha", 1.0f);
+      alpha_tensor->CopyDataToTensor(&alpha);
+      alpha_A = graph_ep->GetGraph()->CreateTensor(
+          input_A->GetSpec().AsTransientSpec());
+      multiply_impl(input_A, alpha_tensor, alpha_A);
+      final_A = alpha_A;
+    }
+    // beta only matters when C is present. Guarding on has_C avoids indexing inputs[2]
+    // out of bounds for a Gemm that sets beta != 1 but omits the optional C input.
+    if (has_beta && has_C) {
+      auto beta_tensor = graph_ep->GetGraph()->CreateTensor(coef_spec);
+      auto beta = helper.Get("beta", 1.0f);
+      beta_tensor->CopyDataToTensor(&beta);
+      beta_C = graph_ep->GetGraph()->CreateTensor(
+          inputs[2]->GetSpec().AsTransientSpec());
+      multiply_impl(inputs[2], beta_tensor, beta_C);
+    } else if (has_C) {
+      beta_C = inputs[2];
+    }
+
+    // NOTE(review): the transposed transient tensors below reuse the un-transposed
+    // input spec (including its shape); confirm this is correct for non-square inputs.
+    if (trans_A) {
+      final_A = graph_ep->GetGraph()->CreateTensor(
+          input_A->GetSpec().AsTransientSpec());
+      transpose_impl(alpha_A, final_A);
+    }
+    if (!trans_B) {
+      final_B = graph_ep->GetGraph()->CreateTensor(
+          input_B->GetSpec().AsTransientSpec());
+      transpose_impl(input_B, final_B);
+    }
+    std::vector<std::shared_ptr<tim::vx::Tensor>> fc_inputs = {final_A, final_B};
+
+    if (has_C) fc_inputs.push_back(beta_C);
+    fc_impl(fc_inputs, outputs[0]);
+
+    return true;
+  }
+};
+}  // namespace npu
+
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/builders/impl/matmul_op_builder.h
+++ b/onnxruntime/core/providers/vsinpu/builders/impl/matmul_op_builder.h
@ -0,0 +1,56 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2024 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#include <memory>
+#include <vector>
+#include <utility>
+#include "core/providers/vsinpu/builders/impl/base_op_builder.h"
+
+namespace onnxruntime {
+namespace vsi {
+namespace npu {
+// Lowers a MatMul node to tim::vx::ops::Matmul.
+class MatMulOpBuilder : public BaseOpBuilder {
+  // Rejects MatMul nodes whose output is a scalar (rank 0), and nodes whose output
+  // shape is not available at all.
+  bool IsOpSupported(const onnxruntime::GraphViewer& graph_viewer,
+                     const Node* node) const override {
+    auto output_defs = node->OutputDefs();
+    // NodeArg::Shape() returns nullptr when the shape is unknown; check before
+    // dereferencing so an un-inferred graph is rejected instead of crashing.
+    const auto* output_shape = output_defs[0]->Shape();
+    if (output_shape == nullptr) {
+      LOGS_DEFAULT(WARNING) << "Output shape is unknown in MatMul op.";
+      return false;
+    }
+    if (output_shape->dim_size() == 0) {
+      LOGS_DEFAULT(WARNING) << "Inner product of 1-D tensor is not supported in MatMul op.";
+      return false;
+    }
+    return true;
+  }
+
+  // Creates the Matmul op, binds all inputs and outputs and appends it to the graph.
+  // Always returns true.
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                     const NodeUnit& node_unit) override {
+    LOGS_DEFAULT(VERBOSE) << "Creating Matmul Op.";
+    auto op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Matmul>();
+    (*op).BindInputs(inputs).BindOutputs(outputs);
+    graph_ep->GetOps().push_back(std::move(op));
+    return true;
+  }
+};
+}  // namespace npu
+
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/builders/impl/norm_op_builder.h
+++ b/onnxruntime/core/providers/vsinpu/builders/impl/norm_op_builder.h
@ -0,0 +1,86 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2024 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#include <memory>
+#include <vector>
+#include <utility>
+#include "core/providers/vsinpu/builders/impl/base_op_builder.h"
+#include "core/providers/shared/utils/utils.h"
+
+namespace onnxruntime {
+namespace vsi {
+namespace npu {
+
+// Lowers a BatchNormalization node to tim::vx::ops::BatchNorm.
+class BatchNormOpBuilder : public BaseOpBuilder {
+  // Positions of the node's inputs as received by this builder.
+  enum NormINPUTS {
+    input_tensor = 0,
+    scale_tensor = 1,
+    Bias_tensor = 2,
+    mean_tensor = 3,
+    var_tensor = 4
+  };
+  int GetMinSupportedOpSet(const NodeUnit& /* node_unit */) const override { return 9; }
+
+  // Rejects training mode, any node carrying a 'spatial' attribute, and nodes whose
+  // scale input is not a constant initializer.
+  bool IsOpSupported(const onnxruntime::GraphViewer& graph_viewer,
+                     const Node* node) const override {
+    NodeAttrHelper helper(*node);
+    if (helper.Get("training_mode", 0) != 0) {
+      LOGS_DEFAULT(WARNING) << "Training is not supported in batch_norm op.";
+      return false;
+    }
+    if (helper.HasAttr("spatial")) {
+      LOGS_DEFAULT(WARNING) << "VSINPU does not support 'spatial' parameter.";
+      return false;
+    }
+    // NOTE(review): only the scale input is checked here although the message mentions
+    // mean/var/gamma/beta; confirm whether the other inputs need the same check.
+    auto input_defs = node->InputDefs();
+    const bool scale_is_const =
+        graph_viewer.IsConstantInitializer(input_defs[NormINPUTS::scale_tensor]->Name(), true);
+    if (!scale_is_const) {
+      LOGS_DEFAULT(WARNING) << "Not support mean/var/gamma/beta set as dynamic input yet.";
+      return false;
+    }
+
+    return true;
+  }
+
+  // Creates the BatchNorm op with the node's epsilon and binds the inputs in the order
+  // {input, mean, var, scale, bias}. Always returns true.
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                     const NodeUnit& node_unit) override {
+    LOGS_DEFAULT(INFO) << "Creating BatchNorm Op.";
+    NodeAttrHelper helper(node_unit.GetNode());
+    auto epsilon = helper.Get("epsilon", 1e-5f);
+    auto op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::BatchNorm>(epsilon);
+    std::vector<std::shared_ptr<tim::vx::Tensor>> reordered_inputs = {
+        inputs[NormINPUTS::input_tensor],
+        inputs[NormINPUTS::mean_tensor],
+        inputs[NormINPUTS::var_tensor],
+        inputs[NormINPUTS::scale_tensor],
+        inputs[NormINPUTS::Bias_tensor]};
+    op->BindInputs(reordered_inputs).BindOutputs(outputs);
+    graph_ep->GetOps().push_back(std::move(op));
+    return true;
+  }
+};
+}  // namespace npu
+
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/builders/impl/pool_op_builder.h
+++ b/onnxruntime/core/providers/vsinpu/builders/impl/pool_op_builder.h
@ -0,0 +1,152 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2023 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#include <memory>
+#include <vector>
+#include <utility>
+#include "core/providers/vsinpu/builders/impl/base_op_builder.h"
+#include "core/providers/shared/utils/utils.h"
+
+namespace onnxruntime {
+namespace vsi {
+namespace npu {
+// Shared base for the pooling builders: holds the pool type and knows how to create
+// the matching Pool1d / Pool2d operation.
+class BasePoolOpBuilder : public BaseOpBuilder {
+ public:
+  explicit BasePoolOpBuilder(tim::vx::PoolType pool_type) : pool_type_(pool_type) {}
+
+ protected:
+  // Rejects 5-D inputs and any node that carries a "dilations" attribute.
+  bool IsOpSupported(const onnxruntime::GraphViewer& graph_viewer, const Node* node) const override {
+    const auto input_shape = vsi::npu::util::GetTensorShape(*node->InputDefs()[0]);
+    if (input_shape.NumDimensions() == 5) {
+      LOGS_DEFAULT(WARNING) << "3DPool is not supported yet.";
+      return false;
+    }
+
+    NodeAttrHelper attrs(*node);
+    if (attrs.HasAttr("dilations")) {
+      LOGS_DEFAULT(WARNING) << "NonMaxPool with Dilation parameter is not supported.";
+      return false;
+    }
+    return true;
+  }
+
+  // Creates a Pool1d (rank-3 input) or Pool2d op, in its global or windowed form, binds
+  // the tensors and appends the op to the graph. Always returns true.
+  //   kernel_size / strides / pads are only read when is_global is false.
+  bool CreatePoolingOp(vsi::npu::GraphEP* graph_ep,
+                       std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                       std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                       const std::array<uint32_t, 2>& kernel_size,
+                       const std::array<uint32_t, 2>& strides,
+                       const std::array<uint32_t, 4>& pads,
+                       bool is_global,
+                       const tim::vx::RoundType ceil_mode) {
+    const bool is_1d_pool = inputs[0]->GetShape().size() == 3;
+    std::shared_ptr<tim::vx::Operation> op;
+
+    if (is_global && is_1d_pool) {
+      // Global 1-D: the window is the first dimension of the input shape.
+      op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Pool1d>(pool_type_, inputs[0]->GetShape()[0],
+                                                                       ceil_mode);
+    } else if (is_global) {
+      // Global 2-D: the window is the first two dimensions of the input shape.
+      std::array<uint32_t, 2> input_size = {inputs[0]->GetShape()[0], inputs[0]->GetShape()[1]};
+      op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Pool2d>(pool_type_, input_size, ceil_mode);
+    } else if (is_1d_pool) {
+      // Windowed 1-D: uses pads[2] / pads[0] and the second kernel / stride entry.
+      std::array<uint32_t, 2> arr = {pads[2], pads[0]};
+      op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Pool1d>(pool_type_, arr,
+                                                                       kernel_size[1], strides[1], ceil_mode);
+    } else {
+      op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Pool2d>(pool_type_, pads, kernel_size,
+                                                                       strides, ceil_mode);
+    }
+
+    op->BindInputs(inputs).BindOutputs(outputs);
+    graph_ep->GetOps().push_back(std::move(op));
+    return true;
+  }
+  tim::vx::PoolType pool_type_;
+};
+
+// Builder for windowed pooling (kernel_shape / strides / pads attributes).
+// Constructs the base with PoolType::MAX; derived classes overwrite pool_type_.
+class TraditionalPoolOpBuilder : public BasePoolOpBuilder {
+ public:
+  TraditionalPoolOpBuilder() : BasePoolOpBuilder(tim::vx::PoolType::MAX) {}
+
+ protected:
+  // Reads the pooling attributes, reorders them into the layout CreatePoolingOp expects
+  // and creates the op. Returns the result of CreatePoolingOp.
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                     const NodeUnit& node_unit) override {
+    NodeAttrHelper helper(node_unit.GetNode());
+    auto ksize = helper.Get("kernel_shape", std::vector<uint32_t>{1U, 1U});
+    auto strides = helper.Get("strides", std::vector<uint32_t>{1U, 1U});
+    auto pads = helper.Get("pads", std::vector<uint32_t>{0U, 0U, 0U, 0U});
+    tim::vx::RoundType ceil_mode = helper.Get("ceil_mode", 0U) == 0
+                                       ? tim::vx::RoundType::FLOOR
+                                       : tim::vx::RoundType::CEILING;
+
+    // For 1-D pooling the attributes hold one kernel/stride value and two pad values,
+    // so indexing them as if they were 2-D reads past the end of the vectors.
+    // Missing entries are replaced by a neutral value; CreatePoolingOp's 1-D path only
+    // consumes the entries that are actually present.
+    auto value_or = [](const std::vector<uint32_t>& values, size_t index, uint32_t fallback) {
+      return index < values.size() ? values[index] : fallback;
+    };
+    const std::array<uint32_t, 2> kernel_arg = {value_or(ksize, 1, 1U), value_or(ksize, 0, 1U)};
+    const std::array<uint32_t, 2> stride_arg = {value_or(strides, 1, 1U), value_or(strides, 0, 1U)};
+    const std::array<uint32_t, 4> pad_arg = {value_or(pads, 1, 0U), value_or(pads, 3, 0U),
+                                             value_or(pads, 0, 0U), value_or(pads, 2, 0U)};
+    return CreatePoolingOp(graph_ep, inputs, outputs, kernel_arg, stride_arg, pad_arg, false, ceil_mode);
+  }
+};
+
+// Builder for global pooling, where the window is taken from the input shape.
+// Constructs the base with PoolType::MAX; derived classes overwrite pool_type_.
+class GlobalPoolOpBuilder : public BasePoolOpBuilder {
+ public:
+  GlobalPoolOpBuilder() : BasePoolOpBuilder(tim::vx::PoolType::MAX) {}
+
+ protected:
+  // Resolves the rounding mode from "ceil_mode" (default 0 -> FLOOR) and delegates to
+  // CreatePoolingOp with is_global set; kernel, strides and pads are passed empty.
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                     const NodeUnit& node_unit) override {
+    NodeAttrHelper attrs(node_unit.GetNode());
+    const bool round_down = attrs.Get("ceil_mode", 0U) == 0;
+    const tim::vx::RoundType ceil_mode = round_down ? tim::vx::RoundType::FLOOR
+                                                    : tim::vx::RoundType::CEILING;
+    return CreatePoolingOp(graph_ep, inputs, outputs, {}, {}, {}, true, ceil_mode);
+  }
+};
+
+// Global pooling with the pool type switched to AVG.
+class GlobalAveragePoolOpBuilder : public GlobalPoolOpBuilder {
+ public:
+  GlobalAveragePoolOpBuilder() {
+    this->pool_type_ = tim::vx::PoolType::AVG;
+  }
+};
+
+// Global pooling with the pool type set to MAX.
+class GlobalMaxPoolOpBuilder : public GlobalPoolOpBuilder {
+ public:
+  GlobalMaxPoolOpBuilder() {
+    this->pool_type_ = tim::vx::PoolType::MAX;
+  }
+};
+
+// Windowed pooling with the pool type switched to AVG.
+class AveragePoolOpBuilder : public TraditionalPoolOpBuilder {
+ public:
+  AveragePoolOpBuilder() {
+    this->pool_type_ = tim::vx::PoolType::AVG;
+  }
+};
+
+// Windowed pooling with the pool type set to MAX.
+class MaxPoolOpBuilder : public TraditionalPoolOpBuilder {
+ public:
+  MaxPoolOpBuilder() {
+    this->pool_type_ = tim::vx::PoolType::MAX;
+  }
+};
+
+}  // namespace npu
+
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/builders/impl/qlinear_binary_op_builder.h
+++ b/onnxruntime/core/providers/vsinpu/builders/impl/qlinear_binary_op_builder.h
@ -0,0 +1,85 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2024 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#include <memory>
+#include <vector>
+#include <utility>
+#include "core/providers/vsinpu/builders/impl/base_op_builder.h"
+
+namespace onnxruntime {
+namespace vsi {
+namespace npu {
+// Shared support check for the QLinear binary builders.
+class BaseQLinearOpBuilder : public BaseOpBuilder {
+  // Positions of the node inputs.
+  enum {
+    INPUT_A = 0,
+    INPUT_A_SCALE = 1,
+    INPUT_A_ZP = 2,
+    INPUT_B = 3,
+    INPUT_B_SCALE = 4,
+    INPUT_B_ZP = 5,
+    OUTPUT_SCALE = 6,
+    OUTPUT_ZP = 7,
+  };
+
+ protected:
+  // Every input other than INPUT_A and INPUT_B must be a constant initializer;
+  // returns false on the first one that is not.
+  bool IsOpSupported(const onnxruntime::GraphViewer& graph_viewer, const Node* node) const override {
+    const auto input_defs = node->InputDefs();
+    const int input_count = static_cast<int>(input_defs.size());
+    for (int idx = 0; idx < input_count; ++idx) {
+      const bool is_data_input = (idx == INPUT_A) || (idx == INPUT_B);
+      if (!is_data_input && !graph_viewer.IsConstantInitializer(input_defs[idx]->Name(), true)) {
+        LOGS_DEFAULT(WARNING) << "Only support const scale / zero point.";
+        return false;
+      }
+    }
+    return true;
+  }
+};
+
+// Lowers QLinearAdd to tim::vx::ops::Add.
+class QLinearAddOpBuilder : public BaseQLinearOpBuilder {
+  // Creates the Add op, binds all inputs and outputs and appends it to the graph.
+  // Always returns true.
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                     const NodeUnit& node_unit) override {
+    LOGS_DEFAULT(VERBOSE) << "Creating QLinearAdd Op.";
+    auto add_op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Add>();
+    add_op->BindInputs(inputs).BindOutputs(outputs);
+    graph_ep->GetOps().push_back(std::move(add_op));
+    return true;
+  }
+};
+
+// Lowers QLinearMul to tim::vx::ops::Multiply.
+class QLinearMulOpBuilder : public BaseQLinearOpBuilder {
+  // Creates the Multiply op, binds all inputs and outputs and appends it to the graph.
+  // Always returns true.
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                     const NodeUnit& node_unit) override {
+    LOGS_DEFAULT(VERBOSE) << "Creating QLinearMul Op.";
+    auto mul_op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Multiply>();
+    mul_op->BindInputs(inputs).BindOutputs(outputs);
+    graph_ep->GetOps().push_back(std::move(mul_op));
+    return true;
+  }
+};
+
+}  // namespace npu
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/builders/impl/qlinearconcat_op_builder.h
+++ b/onnxruntime/core/providers/vsinpu/builders/impl/qlinearconcat_op_builder.h
@ -0,0 +1,48 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2024 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#include <memory>
+#include <vector>
+#include <utility>
+#include "core/providers/shared/utils/utils.h"
+#include "core/providers/vsinpu/builders/impl/base_op_builder.h"
+namespace onnxruntime {
+namespace vsi {
+namespace npu {
+// Lowers QLinearConcat to tim::vx::ops::Concat.
+class QLinearConcatOpBuilder : public BaseOpBuilder {
+  // Converts the "axis" attribute (default 0) with util::ReverseAxis against the rank of
+  // the first input, creates a Concat over all inputs and appends it to the graph.
+  // Always returns true.
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep, std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs, const NodeUnit& node_unit) override {
+    LOGS_DEFAULT(VERBOSE) << "Creating QLinearConcat Op.";
+    NodeAttrHelper attrs(node_unit.GetNode());
+    int axis = attrs.Get("axis", 0);
+    axis = util::ReverseAxis(axis, inputs[0]->GetShape().size());
+    auto concat_op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Concat>(axis, inputs.size());
+    concat_op->BindInputs(inputs).BindOutputs(outputs);
+    graph_ep->GetOps().push_back(std::move(concat_op));
+    return true;
+  }
+};
+
+}  // namespace npu
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/builders/impl/qlinearconv_op_builder.h
+++ b/onnxruntime/core/providers/vsinpu/builders/impl/qlinearconv_op_builder.h
@ -0,0 +1,151 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2024 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#include <string>
+#include <memory>
+#include <vector>
+#include <utility>
+#include "core/providers/shared/utils/utils.h"
+#include "core/providers/vsinpu/builders/impl/base_op_builder.h"
+#include "core/framework/tensorprotoutils.h"
+namespace onnxruntime {
+namespace vsi {
+namespace npu {
+// Builder that lowers an ONNX QLinearConv node to a tim::vx Conv2d or GroupedConv2d operation.
+class QLinearConvOpBuilder : public BaseOpBuilder {
+  // Position of each QLinearConv input inside the node's InputDefs().
+  enum QLinearConvINPUTS {
+    INPUT_TENSOR = 0,
+    INPUT_TENSOR_SCALE = 1,
+    INPUT_TENSOR_ZP = 2,
+    WEIGHT_TENSOR = 3,
+    WEIGHT_TENSOR_SCALE = 4,
+    WEIGHT_TENSOR_ZP = 5,
+    OUTPUT_TENSOR_SCALE = 6,
+    OUTPUT_TENSOR_ZP = 7,
+    BIAS_TENSOR = 8,
+  };
+
+  // Returns true when the node can be handled by this builder. A node is rejected when:
+  //  - the input is not 4-D, or the weight is not 4-D;
+  //  - the input scale or the weight is not a constant initializer;
+  //  - dimension 2 of the weight shape is larger than 15;
+  //  - the weight scale has more than one element, the weight type is int8 and the weight zero point is
+  //    either not a constant initializer, cannot be unpacked, or contains a non-zero value.
+  bool IsOpSupported(const onnxruntime::GraphViewer& graph_viewer,
+                     const Node* node) const override {
+    auto input_defs = node->InputDefs();
+    auto input_shape = vsi::npu::util::GetTensorShape(*input_defs[QLinearConvINPUTS::INPUT_TENSOR]);
+    auto w_scale_shape = vsi::npu::util::GetTensorShape(*input_defs[QLinearConvINPUTS::WEIGHT_TENSOR_SCALE]);
+    // Keep the weight shape in a named variable so that whatever GetDims() returns (presumably a non-owning
+    // view of the shape's storage) never outlives the shape object it was obtained from.
+    auto w_shape = vsi::npu::util::GetTensorShape(*input_defs[QLinearConvINPUTS::WEIGHT_TENSOR]);
+    auto w_shape_dims = w_shape.GetDims();
+    if (input_shape.NumDimensions() != 4) {
+      LOGS_DEFAULT(WARNING) << "Not support conv3d&& conv1d yet.";
+      return false;
+    }
+
+    if (!graph_viewer.IsConstantInitializer(input_defs[QLinearConvINPUTS::INPUT_TENSOR_SCALE]->Name(), true) ||
+        !graph_viewer.IsConstantInitializer(input_defs[WEIGHT_TENSOR]->Name(), true)) {
+      LOGS_DEFAULT(WARNING) << "Not support quantization definitions or weights that are not constant yet.";
+      return false;
+    }
+
+    // The kernel-height check below indexes dimension 2, so make sure that dimension exists.
+    if (w_shape.NumDimensions() != 4) {
+      LOGS_DEFAULT(WARNING) << "Only support 4D weight tensor in QLinearConv.";
+      return false;
+    }
+
+    if (w_shape_dims[2] > 15) {
+      LOGS_DEFAULT(WARNING) << "Not support weight kernel with height higher than 15.";
+      return false;
+    }
+
+    if (w_scale_shape.Size() != 1 && *input_defs[WEIGHT_TENSOR]->Type() == "tensor(int8)") {
+      const ONNX_NAMESPACE::TensorProto* tensor_proto =
+          graph_viewer.GetConstantInitializer(input_defs[QLinearConvINPUTS::WEIGHT_TENSOR_ZP]->Name(), true);
+      // The zero point has not been verified to be constant above, so the lookup may yield no tensor.
+      if (tensor_proto == nullptr) {
+        LOGS_DEFAULT(WARNING) << "Weight zero point must be a constant initializer for per channel int8 weight.";
+        return false;
+      }
+      // A scalar zero point has no dims; otherwise one value per entry of the first dimension.
+      std::vector<int8_t> w_zp(tensor_proto->dims_size() == 0 ? 1 : tensor_proto->dims()[0]);
+
+      auto status = onnxruntime::utils::UnpackTensor(
+          *tensor_proto,
+          tensor_proto->has_raw_data() ? tensor_proto->raw_data().data() : nullptr,
+          tensor_proto->has_raw_data() ? tensor_proto->raw_data().size() : 0,
+          w_zp.data(), w_zp.size());
+      if (!status.IsOK()) {
+        LOGS_DEFAULT(ERROR) << "Failed to get data from weight zp tensor.";
+        return false;
+      }
+      if (std::any_of(w_zp.begin(), w_zp.end(), [](int i) { return i != 0; })) {
+        LOGS_DEFAULT(WARNING) << "Asymmetric perchannel quantization only allows uint8 datatype or int8 with all zero.";
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Creates the convolution operation, binds `inputs` / `outputs` to it and records it on `graph_ep`.
+  //  - When "auto_pad" is something other than "NOTSET", the padding is derived from util::GetPadType().
+  //  - Otherwise the explicit "pads" attribute (all zero when absent) is used. "auto_pad" defaults to
+  //    "NOTSET" when the attribute is missing, as the ONNX operator specification prescribes, so that
+  //    explicit pads are honoured for nodes that omit "auto_pad".
+  //  - GroupedConv2d is used when group is neither 1 nor equal to inputs[1]->GetShape()[3]; otherwise Conv2d
+  //    is used with a multiplier of 0 (group == 1) or GetShape()[3] of inputs[1] divided by GetShape()[2]
+  //    of inputs[0].
+  // Always returns true.
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                     const NodeUnit& node_unit) override {
+    LOGS_DEFAULT(VERBOSE) << "Creating QLinearConv Op.";
+
+    NodeAttrHelper helper(node_unit.GetNode());
+    auto padtype = helper.Get("auto_pad", std::string("NOTSET"));
+    auto group = helper.Get("group", static_cast<uint32_t>(1));
+    std::vector<uint32_t> default_vec = {1, 1, 1, 1};
+    auto stride =
+        helper.Get("strides", default_vec);
+    auto dilation =
+        helper.Get("dilations", default_vec);
+    std::shared_ptr<tim::vx::Operation> op;
+    if (padtype != "NOTSET") {  // padding is derived from auto_pad, array "pads" is not used
+      if (group != 1 && group != inputs[1]->GetShape()[3]) {
+        op = graph_ep->GetGraph()
+                 ->CreateOperation<tim::vx::ops::GroupedConv2d>(
+                     vsi::npu::util::GetPadType(padtype),
+                     std::array<uint32_t, 2>{stride[1], stride[0]},
+                     std::array<uint32_t, 2>{dilation[1], dilation[0]}, group,
+                     tim::vx::DataLayout::WHCN, tim::vx::DataLayout::WHIcOc);
+
+      } else {
+        int32_t multiplier = group == 1 ? 0 : inputs[1]->GetShape()[3] / inputs[0]->GetShape()[2];
+        op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Conv2d>(
+            vsi::npu::util::GetPadType(padtype),
+            std::array<uint32_t, 2>{stride[1], stride[0]},
+            std::array<uint32_t, 2>{dilation[1], dilation[0]}, multiplier,
+            tim::vx::DataLayout::WHCN, tim::vx::DataLayout::WHIcOc);
+      }
+    } else {
+      std::vector<uint32_t> default_pads(4, 0);
+      auto pads = helper.Get("pads", default_pads);
+      // The pads, strides and dilations are re-ordered (index 1 before index 0) when handed to tim-vx.
+      if (group != 1 && group != inputs[1]->GetShape()[3]) {
+        op = graph_ep->GetGraph()
+                 ->CreateOperation<tim::vx::ops::GroupedConv2d>(
+                     std::array<uint32_t, 4>{pads[1], pads[3], pads[0], pads[2]},
+                     std::array<uint32_t, 2>{stride[1], stride[0]},
+                     std::array<uint32_t, 2>{dilation[1], dilation[0]}, group,
+                     tim::vx::DataLayout::WHCN, tim::vx::DataLayout::WHIcOc);
+
+      } else {
+        int32_t multiplier = group == 1 ? 0 : inputs[1]->GetShape()[3] / inputs[0]->GetShape()[2];
+        op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Conv2d>(
+            std::array<uint32_t, 4>{pads[1], pads[3],
+                                    pads[0], pads[2]},
+            std::array<uint32_t, 2>{stride[1], stride[0]},
+            std::array<uint32_t, 2>{dilation[1], dilation[0]}, multiplier,
+            tim::vx::DataLayout::WHCN, tim::vx::DataLayout::WHIcOc);
+      }
+    }
+    (*op).BindInputs(inputs).BindOutputs(outputs);
+    graph_ep->GetOps().push_back(std::move(op));
+    return true;
+  }
+};
+}  // namespace npu
+
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/builders/impl/qlinearmatmul_op_builder.h
+++ b/onnxruntime/core/providers/vsinpu/builders/impl/qlinearmatmul_op_builder.h
@ -0,0 +1,83 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2024 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#include <memory>
+#include <vector>
+#include <utility>
+#include "core/providers/vsinpu/builders/impl/base_op_builder.h"
+
+namespace onnxruntime {
+namespace vsi {
+namespace npu {
+
+// Builder that lowers an ONNX QLinearMatMul node to a tim::vx::ops::Matmul operation.
+class QLinearMatMulOpBuilder : public BaseOpBuilder {
+  // Position of each QLinearMatMul input inside the node's InputDefs().
+  enum {
+    matrixA = 0,
+    A_scale = 1,
+    A_zero_point = 2,
+    matrixB = 3,
+    B_scale = 4,
+    B_zero_point = 5,
+    out_scale = 6,
+    out_zero_point = 7
+  };
+
+  // Accepts the node only when every input other than the two matrices is a constant initializer and
+  // none of the three scale tensors holds more than one element.
+  bool IsOpSupported(const onnxruntime::GraphViewer& graph_viewer,
+                     const Node* node) const override {
+    auto input_defs = node->InputDefs();
+    const auto& lhs_name = input_defs[matrixA]->Name();
+    const auto& rhs_name = input_defs[matrixB]->Name();
+
+    for (auto def : input_defs) {
+      // Inputs are matched against the matrices by name; those are allowed to be non-constant.
+      const bool is_matrix = def->Name() == lhs_name || def->Name() == rhs_name;
+      if (is_matrix) {
+        continue;
+      }
+      if (!graph_viewer.IsConstantInitializer(def->Name(), true)) {
+        LOGS_DEFAULT(WARNING) << "Scale and zero point must be known before setting graph.";
+        return false;
+      }
+    }
+
+    const int64_t lhs_scale_count = util::GetTensorShape(*input_defs[A_scale]).Size();
+    const int64_t rhs_scale_count = util::GetTensorShape(*input_defs[B_scale]).Size();
+    const int64_t out_scale_count = util::GetTensorShape(*input_defs[out_scale]).Size();
+    const bool per_tensor = lhs_scale_count <= 1 && rhs_scale_count <= 1 && out_scale_count <= 1;
+    if (!per_tensor) {
+      LOGS_DEFAULT(WARNING) << "Per channel quantized input/output is not supported in QLinearMatmul Op.";
+      return false;
+    }
+
+    return true;
+  }
+
+  // Creates a default-constructed Matmul operation, binds all `inputs` / `outputs` to it and records it
+  // on `graph_ep`. Always returns true.
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                     const NodeUnit& node_unit) override {
+    LOGS_DEFAULT(INFO) << "Creating QLinearMatmul Op.";
+    auto matmul = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Matmul>();
+    matmul->BindInputs(inputs).BindOutputs(outputs);
+    graph_ep->GetOps().push_back(std::move(matmul));
+    return true;
+  }
+};
+}  // namespace npu
+
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/builders/impl/quantize_op_builder.h
+++ b/onnxruntime/core/providers/vsinpu/builders/impl/quantize_op_builder.h
@ -0,0 +1,79 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2024 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#include <memory>
+#include <vector>
+#include <utility>
+#include "core/providers/vsinpu/builders/impl/base_op_builder.h"
+#include "core/providers/common.h"
+#include "core/providers/shared/utils/utils.h"
+
+namespace onnxruntime {
+namespace vsi {
+namespace npu {
+
+// Builder that lowers an ONNX QuantizeLinear node to a tim::vx::ops::DataConvert operation.
+class QuantizeLinearOpBuilder : public BaseOpBuilder {
+  // Position of each QuantizeLinear input inside the node's InputDefs().
+  enum QuantizeINPUTS {
+    input_tensor = 0,
+    scale_tensor = 1,
+    zero_point_tensor = 2
+  };
+
+  // Accepts the node only when it carries no non-zero "block_size" attribute, its scale (and zero point,
+  // when three inputs are present) are constant initializers, and the scale holds exactly one element.
+  bool IsOpSupported(const onnxruntime::GraphViewer& graph_viewer,
+                     const Node* node) const override {
+    auto input_defs = node->InputDefs();
+    NodeAttrHelper attrs(*node);
+
+    const bool block_quantized = attrs.HasAttr("block_size") && attrs.Get("block_size", 0) != 0;
+    if (block_quantized) {
+      LOGS_DEFAULT(WARNING) << "Not support block quantization.";
+      return false;
+    }
+
+    const bool scale_is_const =
+        graph_viewer.IsConstantInitializer(input_defs[QuantizeINPUTS::scale_tensor]->Name(), true);
+    const bool has_zero_point = input_defs.size() == 3;
+    // The zero point is only looked up when the scale is constant and a third input exists.
+    if (!scale_is_const ||
+        (has_zero_point &&
+         !graph_viewer.IsConstantInitializer(input_defs[QuantizeINPUTS::zero_point_tensor]->Name(), true))) {
+      LOGS_DEFAULT(WARNING) << "Only support const scale / zero point.";
+      return false;
+    }
+
+    auto scale_shape = npu::util::GetTensorShape(*input_defs[QuantizeINPUTS::scale_tensor]);
+    if (scale_shape.Size() != 1) {
+      LOGS_DEFAULT(WARNING) << "Per channel quantized output is not supported in QuantizeLinearOp.";
+      return false;
+    }
+    return true;
+  }
+
+  // Creates a DataConvert operation, binds all `inputs` / `outputs` to it and records it on `graph_ep`.
+  // Always returns true.
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                     const NodeUnit& node_unit) override {
+    LOGS_DEFAULT(INFO) << "Creating Quantize Op.";
+    auto convert = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::DataConvert>();
+    convert->BindInputs(inputs).BindOutputs(outputs);
+    graph_ep->GetOps().push_back(std::move(convert));
+    return true;
+  }
+};
+}  // namespace npu
+
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/builders/impl/reduce_op_builder.h
+++ b/onnxruntime/core/providers/vsinpu/builders/impl/reduce_op_builder.h
@ -0,0 +1,82 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2024 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#include <memory>
+#include <vector>
+#include <utility>
+#include "core/providers/vsinpu/builders/impl/base_op_builder.h"
+#include "core/providers/shared/utils/utils.h"
+
+namespace onnxruntime {
+namespace vsi {
+namespace npu {
+// Builder that lowers an ONNX ReduceMean node to a tim::vx::ops::ReduceMean operation.
+class ReduceMeanOpBuilder : public BaseOpBuilder {
+  // Rejects nodes whose first input is an int32 tensor; everything else is accepted.
+  bool IsOpSupported(const onnxruntime::GraphViewer& graph_viewer,
+                     const Node* node) const override {
+    auto input_defs = node->InputDefs();
+    const bool is_int32_input = *input_defs[0]->Type() == "tensor(int32)";
+    if (is_int32_input) {
+      LOGS_DEFAULT(WARNING) << "Not support int32 reduce mean yet.";
+      return false;
+    }
+    return true;
+  }
+
+  // Creates the ReduceMean operation for inputs[0] and records it on `graph_ep`.
+  // The axes come from, in order of preference: the "axes" attribute (opset < 18 only), the data of
+  // inputs[1] when a second input exists, or every dimension of inputs[0]. They are passed through
+  // util::ReverseAxis before use. When "noop_with_empty_axes" is 1 and only one input is present, no
+  // operation is created and outputs[0] is replaced by inputs[0]. Always returns true.
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                     const NodeUnit& node_unit) override {
+    LOGS_DEFAULT(INFO) << "Creating ReduceMean Op.";
+
+    NodeAttrHelper attrs(node_unit.GetNode());
+    const auto rank = inputs[0]->GetShape().size();
+    std::vector<int64_t> onnx_axes;
+
+    const bool axes_in_attribute = node_unit.SinceVersion() < 18 && attrs.HasAttr("axes");
+    if (axes_in_attribute) {
+      onnx_axes = attrs.Get("axes", onnx_axes);
+    } else if (inputs.size() > 1) {
+      // Axes are supplied as a tensor: read its contents back into host memory.
+      onnx_axes.resize(inputs[1]->GetSpec().GetElementNum());
+      inputs[1]->CopyDataFromTensor(onnx_axes.data());
+    } else {
+      // No axes given: reduce over every dimension.
+      for (std::size_t dim = 0; dim < rank; ++dim) {
+        onnx_axes.push_back(static_cast<int64_t>(dim));
+      }
+    }
+
+    std::vector<int32_t> vx_axes(onnx_axes.begin(), onnx_axes.end());
+    vx_axes = util::ReverseAxis(vx_axes, rank);
+
+    const bool is_noop = attrs.HasAttr("noop_with_empty_axes") && inputs.size() == 1 &&
+                         attrs.Get("noop_with_empty_axes", 0) == 1;
+    if (is_noop) {
+      outputs[0] = inputs[0];
+      return true;
+    }
+
+    const bool keepdims = attrs.Get("keepdims", 1) == 1;
+    auto reduce = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::ReduceMean>(vx_axes, keepdims);
+    reduce->BindInput(inputs[0]).BindOutputs(outputs);
+    graph_ep->GetOps().push_back(std::move(reduce));
+    return true;
+  }
+};
+}  // namespace npu
+
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/builders/impl/resize_op_builder.h
+++ b/onnxruntime/core/providers/vsinpu/builders/impl/resize_op_builder.h
@ -0,0 +1,153 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2024 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#include <memory>
+#include <vector>
+#include <utility>
+#include "core/providers/vsinpu/builders/impl/base_op_builder.h"
+#include "core/providers/shared/utils/utils.h"
+
+namespace onnxruntime {
+namespace vsi {
+namespace npu {
+// Builder that lowers an ONNX Resize node to tim::vx::ops::Resize (or Resize1d for rank-1 inputs).
+class ResizeOpBuilder : public BaseOpBuilder {
+  // Checks the input data type and that the tensors driving the output size are initializers.
+  //  - The first input must not be int64 and must pass util::IsTypeSupported.
+  //  - For opset > 10, inputs 2 and 3 (when present) must be found in `initializers`.
+  //  - For opset <= 10, input 1 must be found in `initializers`.
+  bool HasSupportedInputOutputsImpl(const InitializedTensorSet& initializers,
+                                    const NodeUnit& node_unit) const override {
+    auto input_type = node_unit.Inputs()[0].node_arg.Type();
+    if (*input_type == "tensor(int64)" || !util::IsTypeSupported(&node_unit.Inputs()[0].node_arg)) {
+      LOGS_DEFAULT(WARNING) << node_unit.OpType() << " has unsupported input type : "
+                            << *input_type;
+      return false;
+    }
+    if (node_unit.SinceVersion() > 10) {
+      if (node_unit.Inputs().size() > 2 && !Contains(initializers, node_unit.Inputs()[2].node_arg.Name())) {
+        LOGS_DEFAULT(WARNING) << "Scale tensor must be constant.";
+        return false;
+      }
+      if (node_unit.Inputs().size() > 3 && !Contains(initializers, node_unit.Inputs()[3].node_arg.Name())) {
+        LOGS_DEFAULT(WARNING) << "Size tensor must be constant.";
+        return false;
+      }
+    } else {
+      if (!Contains(initializers, node_unit.Inputs()[1].node_arg.Name())) {
+        LOGS_DEFAULT(WARNING) << "Scale tensor must be constant.";
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Rejects nodes this builder cannot lower: inputs with more than 4 dimensions, a non-zero "antialias"
+  // attribute, a "coordinate_transformation_mode" other than "half_pixel" / "align_corners", a
+  // "keep_aspect_ratio_policy" other than "stretch", "cubic" mode, or an explicit "axes" attribute.
+  bool IsOpSupported(const onnxruntime::GraphViewer& graph_viewer, const Node* node) const override {
+    auto shape = vsi::npu::util::GetTensorShape(*node->InputDefs()[0]);
+    if (shape.NumDimensions() > 4) {
+      LOGS_DEFAULT(WARNING) << "3D or more dimesions resize is not supported.";
+      return false;
+    }
+
+    NodeAttrHelper helper(*node);
+    // The attribute name must match the ONNX spelling exactly, otherwise the default is always returned.
+    if (helper.Get("antialias", 0) != 0) {
+      LOGS_DEFAULT(WARNING) << "Antialias attribute is not supported.";
+      return false;
+    }
+    // Copy the value: the helper may hand back a reference to the temporary default argument, which must
+    // not be kept past this statement.
+    // NOTE(review): with the key spelled correctly, modes such as "asymmetric" are now rejected here as the
+    // warning below states; HandleBuildOp would map them to align_corners == half_pixel_center == false.
+    // Confirm whether "asymmetric" should be allowed explicitly.
+    const std::string coordinate = helper.Get("coordinate_transformation_mode", "half_pixel");
+    if (coordinate != "align_corners" && coordinate != "half_pixel") {
+      LOGS_DEFAULT(WARNING) << "Only support half_pixel and align_corners attributes now.";
+      return false;
+    }
+    if (helper.Get("keep_aspect_ratio_policy", "stretch") != "stretch") {
+      LOGS_DEFAULT(WARNING) << "Not support to keep aspect ratio.";
+      return false;
+    }
+    if (helper.Get("mode", "nearest") == "cubic") {
+      LOGS_DEFAULT(WARNING) << "Not support the cubic resize type yet.";
+      return false;
+    }
+    if (helper.HasAttr("axes")) {
+      LOGS_DEFAULT(WARNING) << "Axes-specifying is not support.";
+      return false;
+    }
+    return true;
+  }
+
+  // Creates the resize operation for inputs[0] and records it on `graph_ep`.
+  //  - "mode" == "nearest" selects NEAREST_NEIGHBOR, anything else BILINEAR.
+  //  - Rank-1 inputs use Resize1d; all other inputs use Resize with a target height/width.
+  //  - The target size is taken from inputs[3] when `is_size_set`, otherwise it is computed from the scale
+  //    tensor (input 2 for opset > 10, input 1 otherwise).
+  // Returns false only when a size tensor holds fewer than two elements for a non-1-D resize.
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                     const NodeUnit& node_unit) override {
+    LOGS_DEFAULT(VERBOSE) << "Creating Resize Op.";
+    bool is_1dresize = inputs[0]->GetShape().size() == 1;
+    NodeAttrHelper helper(node_unit.GetNode());
+    auto onnx_mode = helper.Get("mode", "nearest");
+    auto coordinate_transformation = helper.Get("coordinate_transformation_mode", "half_pixel");
+    // NOTE(review): ONNX Resize passes "sizes" as an input rather than an attribute, so this flag looks
+    // like it can never be true for a conforming model - confirm how the size input should be detected.
+    bool is_size_set = helper.HasAttr("size");
+    int32_t scale_index = node_unit.SinceVersion() > 10 ? 2 : 1;
+
+    auto resize_type = onnx_mode == "nearest" ? tim::vx::ResizeType::NEAREST_NEIGHBOR : tim::vx::ResizeType::BILINEAR;
+    bool align_corners = coordinate_transformation == "align_corners";
+    bool half_pixel_center = coordinate_transformation == "half_pixel";
+    std::shared_ptr<tim::vx::Operation> op = nullptr;
+    if (is_1dresize) {
+      int target_size;
+      if (is_size_set) {
+        int64_t onnx_size;
+        inputs[3]->CopyDataFromTensor(&onnx_size);
+        target_size = static_cast<int>(onnx_size);
+        op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Resize1d>(resize_type, 0.0f, align_corners,
+                                                                           half_pixel_center, target_size);
+      } else {
+        float scale;
+        inputs[scale_index]->CopyDataFromTensor(&scale);
+        op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Resize1d>(resize_type, scale, align_corners,
+                                                                           half_pixel_center, 0);
+      }
+    } else {
+      int target_h, target_w;
+      if (is_size_set) {
+        // The buffer must hold every element of the size tensor, not one slot per tensor dimension.
+        std::vector<int64_t> onnx_sizes(inputs[3]->GetSpec().GetElementNum());
+        if (onnx_sizes.size() < 2) {
+          LOGS_DEFAULT(ERROR) << "Size tensor of Resize must hold at least two elements.";
+          return false;
+        }
+        inputs[3]->CopyDataFromTensor(onnx_sizes.data());
+        // The copied values are treated as being in ONNX order, like the scales below (which are indexed
+        // in reverse against the tim-vx shape), so height and width are the last two entries.
+        target_h = static_cast<int>(onnx_sizes[onnx_sizes.size() - 2]);
+        target_w = static_cast<int>(onnx_sizes[onnx_sizes.size() - 1]);
+        op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Resize>(resize_type, 0.0f, align_corners,
+                                                                         half_pixel_center, target_h, target_w);
+      } else {
+        auto input_shape = inputs[0]->GetShape();
+        std::vector<float> scales(input_shape.size());
+        std::vector<uint32_t> out_shape(input_shape.size());
+        inputs[scale_index]->CopyDataFromTensor(scales.data());
+        // input_shape is indexed in the opposite order to the scales, hence the mirrored index.
+        for (std::size_t i = 0; i < input_shape.size(); i++) {
+          out_shape[i] = input_shape[i] * scales[input_shape.size() - 1 - i];
+        }
+        op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Resize>(resize_type, 0, align_corners,
+                                                                         half_pixel_center, out_shape[1], out_shape[0]);
+      }
+    }
+
+    (*op).BindInput(inputs[0]).BindOutputs(outputs);
+    graph_ep->GetOps().push_back(std::move(op));
+    return true;
+  }
+};
+
+}  // namespace npu
+
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/builders/impl/softmax_op_builder.h
+++ b/onnxruntime/core/providers/vsinpu/builders/impl/softmax_op_builder.h
@ -0,0 +1,101 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2024 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#include <memory>
+#include <vector>
+#include <utility>
+#include <functional>
+#include "core/providers/vsinpu/builders/impl/base_op_builder.h"
+#include "core/providers/common.h"
+#include "core/providers/shared/utils/utils.h"
+
+namespace onnxruntime {
+namespace vsi {
+namespace npu {
+// Builder that lowers an ONNX Softmax node to tim::vx::ops::Softmax, inserting reshapes for the
+// "coerce to 2D" semantics of opsets before 13.
+class SoftmaxOpBuilder : public BaseOpBuilder {
+  // Rejects nodes whose "axis" lies outside [-rank, rank - 1]. When the attribute is absent the default
+  // is the one HandleBuildOp uses: 1 for opsets before 13, -1 afterwards.
+  bool IsOpSupported(const onnxruntime::GraphViewer& graph_viewer,
+                     const Node* node) const override {
+    NodeAttrHelper helper(*node);
+    int32_t def_val = node->SinceVersion() < 13 ? 1 : -1;
+    auto axis = helper.Get("axis", def_val);
+    auto input_defs = node->InputDefs();
+    auto input_shape = vsi::npu::util::GetTensorShape(*input_defs[0]);
+    int32_t rank = input_shape.NumDimensions();
+    if (axis >= rank || axis < -rank) {
+      LOGS_DEFAULT(ERROR) << "Axis is invalid in Softmax.";
+      return false;
+    }
+    return true;
+  }
+
+  // Creates the softmax operation(s) and records them on `graph_ep`. Always returns true.
+  //  - Opset >= 13: a single Softmax along util::ReverseAxis(axis, rank).
+  //  - Opset < 13: the input is viewed as a 2-D tensor split at `axis`. A rank-2 input with axis 1 is
+  //    already in that form and gets a single Softmax; anything else is lowered as
+  //    Reshape -> Softmax -> Reshape.
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                     const NodeUnit& node_unit) override {
+    LOGS_DEFAULT(VERBOSE) << "Creating Softmax Op.";
+    NodeAttrHelper helper(node_unit.GetNode());
+    int32_t def_val = node_unit.SinceVersion() < 13 ? 1 : -1;
+    auto axis = helper.Get("axis", def_val);
+    // Bind the shape once so that every iterator below refers to the same container.
+    const auto& input_shape = inputs[0]->GetShape();
+
+    if (def_val == 1) {
+      // In earlier opset version of softmax, input is coerced into 2D shape
+      // Attribute "axis" is to describe the axis of the inputs coerced to 2D but not take part in softmax computation
+      axis = HandleNegativeAxis(axis, input_shape.size());
+      const bool already_coerced = input_shape.size() == 2 && axis == 1;
+      if (!already_coerced) {
+        // The trailing `axis` entries of input_shape form one dimension of the 2-D view and the remaining
+        // entries form the other (presumably because the tim-vx shape is the reverse of the ONNX one).
+        auto it = input_shape.end();
+        uint32_t last_dim = std::accumulate(it - axis, it, 1, std::multiplies<uint32_t>());
+        uint32_t first_dim = std::accumulate(input_shape.begin(), it - axis, 1, std::multiplies<uint32_t>());
+        auto reshaped_spec = inputs[0]->GetSpec().AsTransientSpec().SetShape(
+            std::vector<uint32_t>{first_dim, last_dim});
+        auto reshaped_input = graph_ep->GetGraph()->CreateTensor(reshaped_spec);
+        // Softmax keeps the shape of its input, so its intermediate output carries the 2-D shape as well;
+        // the original shape is only restored by the trailing Reshape.
+        auto reshaped_output = graph_ep->GetGraph()->CreateTensor(reshaped_spec);
+
+        auto reshape_input_op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Reshape>(
+            std::vector<uint32_t>{first_dim, last_dim});
+        auto softmax_op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Softmax>(1, 0);
+        auto reshaped_output_op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Reshape>(input_shape);
+
+        (*reshape_input_op).BindInputs(inputs).BindOutput(reshaped_input);
+        (*softmax_op).BindInput(reshaped_input).BindOutput(reshaped_output);
+        (*reshaped_output_op).BindInput(reshaped_output).BindOutputs(outputs);
+
+        graph_ep->GetOps().push_back(std::move(reshape_input_op));
+        graph_ep->GetOps().push_back(std::move(softmax_op));
+        graph_ep->GetOps().push_back(std::move(reshaped_output_op));
+      } else {
+        auto op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Softmax>(1, 0);
+        (*op).BindInputs(inputs).BindOutputs(outputs);
+        graph_ep->GetOps().push_back(std::move(op));
+      }
+    } else {
+      axis = util::ReverseAxis(axis, input_shape.size());
+      auto op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Softmax>(1, static_cast<uint32_t>(axis));
+      (*op).BindInputs(inputs).BindOutputs(outputs);
+      graph_ep->GetOps().push_back(std::move(op));
+    }
+    return true;
+  }
+};
+}  // namespace npu
+
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/builders/impl/squeeze_op_builder.h
+++ b/onnxruntime/core/providers/vsinpu/builders/impl/squeeze_op_builder.h
@ -0,0 +1,88 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2024 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#include <memory>
+#include <vector>
+#include <utility>
+#include "core/providers/vsinpu/builders/impl/base_op_builder.h"
+#include "core/providers/shared/utils/utils.h"
+
+namespace onnxruntime {
+namespace vsi {
+namespace npu {
+// Translates an ONNX Squeeze node into a TIM-VX Squeeze operation.
+class SqueezeOpBuilder : public BaseOpBuilder {
+  // Accepts the node only when the data input has a supported, non-int64 type
+  // and, for opsets above 11 where axes may arrive as a second input, that
+  // input is a constant initializer.
+  bool HasSupportedInputOutputsImpl(const InitializedTensorSet& initializers,
+                                    const NodeUnit& node_unit) const override {
+    auto input_type = node_unit.Inputs()[0].node_arg.Type();
+    if (*input_type == "tensor(int64)" || !util::IsTypeSupported(&node_unit.Inputs()[0].node_arg)) {
+      LOGS_DEFAULT(WARNING) << node_unit.OpType() << " has unsupported input type : "
+                            << *input_type;
+      return false;
+    }
+    if (node_unit.SinceVersion() > 11) {
+      if (node_unit.Inputs().size() > 1 && !Contains(initializers, node_unit.Inputs()[1].node_arg.Name())) {
+        LOGS_DEFAULT(WARNING) << "Only support const axes in Squeeze op.";
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Collects the ONNX axes (from the "axes" attribute, the axes input tensor, or
+  // by scanning the input shape for unit dimensions), converts them with
+  // util::ReverseAxis and registers a tim::vx::ops::Squeeze on graph_ep.
+  // Always returns true.
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                     const NodeUnit& node_unit) override {
+    LOGS_DEFAULT(INFO) << "Creating Squeeze Op.";
+
+    NodeAttrHelper helper(node_unit.GetNode());
+    std::vector<int64_t> def_axes;
+    const auto& input_shape = inputs[0]->GetShape();
+    auto input_shape_size = input_shape.size();
+
+    if (node_unit.SinceVersion() < 13 && helper.HasAttr("axes")) {
+      def_axes = helper.Get("axes", def_axes);
+    } else if (inputs.size() > 1) {
+      def_axes.resize(inputs[1]->GetSpec().GetElementNum());
+      inputs[1]->CopyDataFromTensor(def_axes.data());
+    } else {  // if axes is empty from onnx, check input shape to determine
+      // The tensor shape is stored in reversed (TIM-VX) order relative to ONNX,
+      // while def_axes must hold ONNX axes because ReverseAxis is applied below.
+      // Record the mirrored index so the conversion lands back on dimension i.
+      // NOTE(review): relies on ReverseAxis mapping axis a to (rank - 1 - a) - confirm.
+      for (size_t i = 0; i < input_shape_size; ++i) {
+        if (input_shape[i] == 1) {
+          def_axes.push_back(static_cast<int64_t>(input_shape_size - 1 - i));
+        }
+      }
+    }
+
+    std::vector<int32_t> axes(def_axes.begin(), def_axes.end());
+    axes = util::ReverseAxis(axes, input_shape_size);
+
+    std::vector<uint32_t> timvx_axes(axes.begin(), axes.end());
+
+    auto op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Squeeze>(timvx_axes);
+    (*op).BindInput(inputs[0]).BindOutputs(outputs);
+    graph_ep->GetOps().push_back(std::move(op));
+    return true;
+  }
+};
+}  // namespace npu
+
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/builders/impl/tensor_op_builder.h
+++ b/onnxruntime/core/providers/vsinpu/builders/impl/tensor_op_builder.h
@ -0,0 +1,142 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2024 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#include <memory>
+#include <vector>
+#include <utility>
+#include "core/providers/vsinpu/builders/impl/base_op_builder.h"
+#include "core/providers/shared/utils/utils.h"
+
+namespace onnxruntime {
+namespace vsi {
+namespace npu {
+// Translates an ONNX Reshape node (opset 5 and later) into a TIM-VX Reshape.
+class ReshapeOpBuilder : public BaseOpBuilder {
+  int GetMinSupportedOpSet(const NodeUnit& /* node_unit */) const override { return 5; }
+
+  // Requires the target-shape input to be an initializer, both inputs to have
+  // supported types, and the data input not to be int64.
+  bool HasSupportedInputOutputsImpl(const InitializedTensorSet& initializers,
+                                    const NodeUnit& node_unit) const override {
+    auto input = node_unit.Inputs()[0];
+    auto shape = node_unit.Inputs()[1];
+    if (initializers.end() == initializers.find(shape.node_arg.Name())) {
+      LOGS_DEFAULT(VERBOSE) << "Target shape of reshape op must be known.";
+      return false;
+    }
+    if (util::IsTypeSupported(&input.node_arg) && util::IsTypeSupported(&shape.node_arg)) {
+      if (*input.node_arg.Type() != "tensor(int64)") {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  // Rejects the node when the target shape is not a readable 1-D constant
+  // initializer, or when allowzero is set and the shape contains a 0.
+  bool IsOpSupported(const onnxruntime::GraphViewer& graph_viewer,
+                     const Node* node) const override {
+    auto input_defs = node->InputDefs();
+    if (input_defs.size() < 2) {
+      LOGS_DEFAULT(VERBOSE) << "Target shape of reshape op must be known.";
+      return false;
+    }
+
+    NodeAttrHelper helper(*node);
+    const bool allow_zero = helper.Get("allowzero", 0) == 1;
+
+    // GetConstantInitializer yields a pointer; guard against a missing constant
+    // instead of dereferencing it unconditionally.
+    const auto* shape_tensor_proto = graph_viewer.GetConstantInitializer(input_defs[1]->Name(), true);
+    if (shape_tensor_proto == nullptr) {
+      LOGS_DEFAULT(VERBOSE) << "Target shape of reshape op must be known.";
+      return false;
+    }
+    // dims()[0] below is only valid for a 1-D shape tensor.
+    if (shape_tensor_proto->dims_size() != 1) {
+      LOGS_DEFAULT(VERBOSE) << "Target shape of reshape op must be a 1-D tensor.";
+      return false;
+    }
+
+    std::vector<int64_t> target_shape(shape_tensor_proto->dims()[0]);
+    auto status = onnxruntime::utils::UnpackTensor(
+        *shape_tensor_proto,
+        shape_tensor_proto->has_raw_data() ? shape_tensor_proto->raw_data().data() : nullptr,
+        shape_tensor_proto->has_raw_data() ? shape_tensor_proto->raw_data().size() : 0,
+        target_shape.data(), target_shape.size());
+    if (!status.IsOK()) {
+      // Without the unpacked values the allowzero check below would be meaningless.
+      LOGS_DEFAULT(VERBOSE) << "Failed to read target shape of reshape op: " << status.ErrorMessage();
+      return false;
+    }
+
+    // Check if the target shape has any 0's when allow zero is enabled.
+    if (allow_zero &&
+        std::find(target_shape.begin(), target_shape.end(), static_cast<int64_t>(0)) != target_shape.end()) {
+      LOGS_DEFAULT(VERBOSE) << "Reshape doesn't support 0 as dimension when allowzero is enabled";
+      return false;
+    }
+
+    return true;
+  }
+
+  // Reads the ONNX target shape from inputs[1], resolves 0 (copy the input
+  // dimension) and -1 (inferred dimension) entries, reverses the result into
+  // TIM-VX order and registers a tim::vx::ops::Reshape. Always returns true.
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                     const NodeUnit& node_unit) override {
+    LOGS_DEFAULT(VERBOSE) << "Creating Reshape Op.";
+    std::vector<int64_t> new_shape(inputs[1]->GetShape()[0]);
+    inputs[1]->CopyDataFromTensor(new_shape.data());
+    for (size_t i = 0; i < new_shape.size(); i++) {
+      if (new_shape[i] == 0) {
+        // new_shape is in ONNX order while the tensor shape is reversed, hence
+        // the mirrored index.
+        new_shape[i] = inputs[0]->GetShape()[inputs[0]->GetShape().size() - i - 1];
+      }
+    }
+
+    // Product of all explicit dimensions; a -1 entry is skipped here.
+    int64_t element_count = std::accumulate(new_shape.begin(), new_shape.end(), static_cast<int64_t>(1),
+                                            [&](int64_t a, int64_t b) {
+                                              return b == -1 ? a : a * b;
+                                            });
+    auto negative_it = std::find(new_shape.begin(), new_shape.end(), -1);
+    if (negative_it != new_shape.end()) {
+      *negative_it = inputs[0]->GetSpec().GetElementNum() / element_count;
+    }
+
+    std::vector<uint32_t> new_shape_uint32(new_shape.begin(), new_shape.end());
+    std::reverse(new_shape_uint32.begin(), new_shape_uint32.end());
+    auto op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Reshape>(new_shape_uint32);
+    (*op).BindInput(inputs[0]).BindOutputs(outputs);
+    graph_ep->GetOps().push_back(std::move(op));
+    return true;
+  }
+};
+
+// Translates an ONNX Transpose node into a TIM-VX Transpose operation.
+class TransposeOpBuilder : public BaseOpBuilder {
+  // Supported only when the "perm" attribute (if present) has exactly one entry
+  // per input dimension.
+  bool IsOpSupported(const onnxruntime::GraphViewer& graph_viewer,
+                     const Node* node) const override {
+    const auto rank = vsi::npu::util::GetTensorShape(*node->InputDefs()[0]).NumDimensions();
+    // Only the length of the fallback matters: it makes an absent attribute
+    // pass the size comparison below.
+    const std::vector<uint32_t> fallback_perm(rank, 1);
+    NodeAttrHelper attrs(*node);
+    const auto perm = attrs.Get("perm", fallback_perm);
+    if (perm.size() == rank) {
+      return true;
+    }
+    LOGS_DEFAULT(VERBOSE) << "Size mismatch between perm vector and input shape.";
+    return false;
+  }
+
+  // Converts the ONNX permutation into TIM-VX order (both the position and the
+  // referenced axis are mirrored) and registers a tim::vx::ops::Transpose.
+  // Always returns true.
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                     const NodeUnit& node_unit) override {
+    LOGS_DEFAULT(VERBOSE) << "Creating Transpose Op.";
+
+    // Default permutation when the attribute is absent: reverse all axes.
+    const size_t rank = inputs[0]->GetShape().size();
+    std::vector<int64_t> onnx_perm(rank);
+    for (size_t idx = 0; idx < rank; ++idx) {
+      onnx_perm[idx] = rank - idx - 1;
+    }
+
+    NodeAttrHelper attrs(node_unit.GetNode());
+    onnx_perm = attrs.Get("perm", onnx_perm);
+
+    // Walk the ONNX permutation back to front, mirroring each axis index.
+    const size_t perm_len = onnx_perm.size();
+    std::vector<uint32_t> timvx_perm;
+    for (auto it = onnx_perm.rbegin(); it != onnx_perm.rend(); ++it) {
+      timvx_perm.push_back(perm_len - 1 - *it);
+    }
+
+    auto transpose_op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Transpose>(timvx_perm);
+    (*transpose_op).BindInputs(inputs).BindOutputs(outputs);
+    graph_ep->GetOps().push_back(std::move(transpose_op));
+    return true;
+  }
+};
+
+}  // namespace npu
+
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/builders/impl/tile_op_builder.h
+++ b/onnxruntime/core/providers/vsinpu/builders/impl/tile_op_builder.h
@ -0,0 +1,71 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2024 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#include <memory>
+#include <vector>
+#include <utility>
+#include "core/providers/vsinpu/builders/impl/base_op_builder.h"
+
+namespace onnxruntime {
+namespace vsi {
+namespace npu {
+// Translates an ONNX Tile node (opset 6 and later) into a TIM-VX Tile operation.
+class TileOpBuilder : public BaseOpBuilder {
+  int GetMinSupportedOpSet(const NodeUnit& /* node_unit */) const override { return 6; }
+
+  // Requires the multipliers input to be an initializer, both inputs to have
+  // supported types, and the data input not to be int64.
+  bool HasSupportedInputOutputsImpl(const InitializedTensorSet& initializers,
+                                    const NodeUnit& node_unit) const override {
+    const auto& data_arg = node_unit.Inputs()[0].node_arg;
+    const auto& repeats_arg = node_unit.Inputs()[1].node_arg;
+
+    if (initializers.find(repeats_arg.Name()) == initializers.end()) {
+      LOGS_DEFAULT(WARNING) << "Multipliers of tile op must be known.";
+      return false;
+    }
+
+    const bool types_supported = util::IsTypeSupported(&data_arg) && util::IsTypeSupported(&repeats_arg);
+    if (types_supported && *data_arg.Type() != "tensor(int64)") {
+      return true;
+    }
+
+    LOGS_DEFAULT(WARNING) << "Input type not supported.";
+    return false;
+  }
+
+  // Reads the int64 multipliers from inputs[1], reverses them into TIM-VX order,
+  // narrows them to int32 and registers a tim::vx::ops::Tile. Always returns true.
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                     const NodeUnit& node_unit) override {
+    LOGS_DEFAULT(VERBOSE) << "Creating Tile Op.";
+
+    std::vector<int64_t> onnx_repeats(inputs[1]->GetShape()[0]);
+    inputs[1]->CopyDataFromTensor(onnx_repeats.data());
+
+    // Building from reverse iterators reverses and narrows in a single step.
+    std::vector<int32_t> timvx_multipliers(onnx_repeats.rbegin(), onnx_repeats.rend());
+
+    auto tile_op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Tile>(timvx_multipliers);
+    (*tile_op).BindInput(inputs[0]).BindOutputs(outputs);
+    graph_ep->GetOps().push_back(std::move(tile_op));
+    return true;
+  }
+};
+
+}  // namespace npu
+
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/builders/impl/unsqueeze_op_builder.h
+++ b/onnxruntime/core/providers/vsinpu/builders/impl/unsqueeze_op_builder.h
@ -0,0 +1,89 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2024 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#include <memory>
+#include <vector>
+#include <utility>
+#include "core/providers/vsinpu/builders/impl/base_op_builder.h"
+#include "core/providers/shared/utils/utils.h"
+
+namespace onnxruntime {
+namespace vsi {
+namespace npu {
+// Translates an ONNX Unsqueeze node into a TIM-VX Reshape that inserts the
+// requested unit dimensions.
+class UnsqueezeOpBuilder : public BaseOpBuilder {
+  // Accepts the node only when the data input has a supported, non-int64 type
+  // and, for opsets above 11 where axes is an input, that input exists and is a
+  // constant initializer.
+  bool HasSupportedInputOutputsImpl(const InitializedTensorSet& initializers,
+                                    const NodeUnit& node_unit) const override {
+    auto input_type = node_unit.Inputs()[0].node_arg.Type();
+    if (*input_type == "tensor(int64)" || !util::IsTypeSupported(&node_unit.Inputs()[0].node_arg)) {
+      LOGS_DEFAULT(WARNING) << node_unit.OpType() << " has unsupported input type : "
+                            << *input_type;
+      return false;
+    }
+    if (node_unit.SinceVersion() > 11) {
+      // Check the input count before indexing so a node without an axes input
+      // is rejected instead of reading past the end of Inputs().
+      if (node_unit.Inputs().size() < 2 || !Contains(initializers, node_unit.Inputs()[1].node_arg.Name())) {
+        LOGS_DEFAULT(WARNING) << "Only support const axes in Unsqueeze op.";
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Collects the ONNX axes (from the "axes" attribute before opset 13, otherwise
+  // from the axes input tensor), converts them with util::ReverseAxis relative
+  // to the output rank, inserts a 1 at each resulting position of the input
+  // shape and registers a tim::vx::ops::Reshape. Always returns true.
+  bool HandleBuildOp(vsi::npu::GraphEP* graph_ep,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& inputs,
+                     std::vector<std::shared_ptr<tim::vx::Tensor>>& outputs,
+                     const NodeUnit& node_unit) override {
+    LOGS_DEFAULT(INFO) << "Creating Unsqueeze Op.";
+
+    NodeAttrHelper helper(node_unit.GetNode());
+    std::vector<int64_t> def_axes;
+    const auto& input_shape = inputs[0]->GetShape();
+    auto input_shape_size = input_shape.size();
+
+    if (node_unit.SinceVersion() < 13 && helper.HasAttr("axes")) {
+      def_axes = helper.Get("axes", def_axes);
+    } else if (inputs.size() > 1) {
+      def_axes.resize(inputs[1]->GetSpec().GetElementNum());
+      inputs[1]->CopyDataFromTensor(def_axes.data());
+    }
+    // No axes leaves def_axes empty, so the output shape equals the input shape.
+    // Unlike Squeeze, existing unit dimensions say nothing about where new ones
+    // belong, so they are not used to guess axes here.
+
+    std::vector<int32_t> axes(def_axes.begin(), def_axes.end());
+    // Axes refer to positions in the output tensor, whose rank is the input rank
+    // plus the number of inserted dimensions.
+    axes = util::ReverseAxis(axes, input_shape_size + axes.size());
+
+    std::vector<uint32_t> timvx_out_shape(input_shape.begin(), input_shape.end());
+    for (int32_t dim : axes) {
+      timvx_out_shape.insert(timvx_out_shape.begin() + dim, 1);
+    }
+
+    auto op = graph_ep->GetGraph()->CreateOperation<tim::vx::ops::Reshape>(timvx_out_shape);
+    (*op).BindInput(inputs[0]).BindOutputs(outputs);
+    graph_ep->GetOps().push_back(std::move(op));
+    return true;
+  }
+};
+}  // namespace npu
+
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/builders/op_builder.h
+++ b/onnxruntime/core/providers/vsinpu/builders/op_builder.h
@ -0,0 +1,48 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2023 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#pragma once
+#include "core/graph/graph_viewer.h"
+#include "core/framework/node_unit.h"
+
+namespace onnxruntime {
+namespace vsi {
+namespace npu {
+class GraphEP;
+
+// Interface implemented by every VSINPU op builder: a support query plus the
+// routine that adds the node to a GraphEP.
+class IOpBuilder {
+ public:
+  IOpBuilder() = default;
+  virtual ~IOpBuilder() = default;
+
+  // Reports whether this builder can handle the node unit. The base
+  // implementation accepts everything.
+  virtual bool IsSupported(const onnxruntime::GraphViewer& /* graph_viewer */,
+                           const NodeUnit& /* node_unit */) const {
+    return true;
+  }
+
+  // Adds the operation(s) for node_unit to graph_ep; must be provided by
+  // concrete builders.
+  virtual bool BuildOp(GraphEP* graph_ep,
+                       const onnxruntime::GraphViewer& graph_viewer,
+                       const NodeUnit& node_unit) = 0;
+};
+}  // namespace npu
+
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/builders/op_builder_factory.h
+++ b/onnxruntime/core/providers/vsinpu/builders/op_builder_factory.h
@ -0,0 +1,133 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2023 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#pragma once
+#include <string>
+#include <memory>
+#include <map>
+#include <utility>
+#include "impl/activation_op_builder.h"
+#include "impl/conv_op_builder.h"
+#include "impl/elementwise_op_builder.h"
+#include "impl/gemm_op_builder.h"
+#include "impl/pool_op_builder.h"
+#include "impl/qlinearconv_op_builder.h"
+#include "impl/flatten_op_builder.h"
+#include "impl/matmul_op_builder.h"
+#include "impl/tensor_op_builder.h"
+#include "impl/concat_op_builder.h"
+#include "impl/softmax_op_builder.h"
+#include "impl/norm_op_builder.h"
+#include "impl/clip_op_builder.h"
+#include "impl/reduce_op_builder.h"
+#include "impl/quantize_op_builder.h"
+#include "impl/dequantize_op_builder.h"
+#include "impl/qlinearmatmul_op_builder.h"
+#include "impl/qlinear_binary_op_builder.h"
+#include "impl/qlinearconcat_op_builder.h"
+#include "impl/gather_op_builder.h"
+#include "impl/tile_op_builder.h"
+#include "impl/squeeze_op_builder.h"
+#include "impl/unsqueeze_op_builder.h"
+#include "impl/resize_op_builder.h"
+#include "impl/cast_op_builder.h"
+namespace onnxruntime {
+namespace vsi {
+namespace npu {
+// Callable that creates a fresh builder instance.
+using createIOpBuildItemFunc = std::function<std::unique_ptr<IOpBuilder>()>;
+// ONNX op type name -> builder instance.
+using OpBuildItemType = std::map<std::string, std::unique_ptr<IOpBuilder>>;
+
+// Expands to one {op type name, factory} entry of the registry below.
+#define REGISTER_OP_BUILDER(ONNX_NODE_TYPE, BUILDER_TYPE)                                  \
+  {                                                                                        \
+    ONNX_NODE_TYPE,                                                                        \
+        []() -> std::unique_ptr<IOpBuilder> { return std::make_unique<BUILDER_TYPE>(); }   \
+  }
+
+// Registry of every ONNX op type handled by this provider, keyed by op type name.
+static const std::map<std::string, createIOpBuildItemFunc> reg = {
+    REGISTER_OP_BUILDER("Add", AddOpBuilder),
+    REGISTER_OP_BUILDER("Sub", SubOpBuilder),
+    REGISTER_OP_BUILDER("Mul", MulOpBuilder),
+    REGISTER_OP_BUILDER("Div", DivOpBuilder),
+    REGISTER_OP_BUILDER("Abs", AbsOpBuilder),
+    REGISTER_OP_BUILDER("Pow", PowOpBuilder),
+    REGISTER_OP_BUILDER("Sqrt", SqrtOpBuilder),
+    REGISTER_OP_BUILDER("Exp", ExpOpBuilder),
+    REGISTER_OP_BUILDER("Floor", FloorOpBuilder),
+    REGISTER_OP_BUILDER("Log", LogOpBuilder),
+    REGISTER_OP_BUILDER("Sin", SinOpBuilder),
+    REGISTER_OP_BUILDER("Conv", ConvOpBuilder),
+    REGISTER_OP_BUILDER("Gemm", GemmOpBuilder),
+    REGISTER_OP_BUILDER("Relu", ReluOpBuilder),
+    REGISTER_OP_BUILDER("LeakyRelu", LeakyReluOpBuilder),
+    REGISTER_OP_BUILDER("Tanh", TanhOpBuilder),
+    REGISTER_OP_BUILDER("Sigmoid", SigmoidOpBuilder),
+    REGISTER_OP_BUILDER("HardSigmoid", HardSigmoidOpBuilder),
+    REGISTER_OP_BUILDER("HardSwish", HardSwishOpBuilder),
+    REGISTER_OP_BUILDER("GlobalAveragePool", GlobalAveragePoolOpBuilder),
+    REGISTER_OP_BUILDER("QLinearConv", QLinearConvOpBuilder),
+    REGISTER_OP_BUILDER("Flatten", FlattenOpBuilder),
+    REGISTER_OP_BUILDER("MatMul", MatMulOpBuilder),
+    REGISTER_OP_BUILDER("GlobalMaxPool", GlobalMaxPoolOpBuilder),
+    REGISTER_OP_BUILDER("AveragePool", AveragePoolOpBuilder),
+    REGISTER_OP_BUILDER("MaxPool", MaxPoolOpBuilder),
+    REGISTER_OP_BUILDER("Reshape", ReshapeOpBuilder),
+    REGISTER_OP_BUILDER("Concat", ConcatOpBuilder),
+    REGISTER_OP_BUILDER("Softmax", SoftmaxOpBuilder),
+    REGISTER_OP_BUILDER("Transpose", TransposeOpBuilder),
+    REGISTER_OP_BUILDER("BatchNormalization", BatchNormOpBuilder),
+    REGISTER_OP_BUILDER("Clip", ClipOpBuilder),
+    REGISTER_OP_BUILDER("ReduceMean", ReduceMeanOpBuilder),
+    REGISTER_OP_BUILDER("QuantizeLinear", QuantizeLinearOpBuilder),
+    REGISTER_OP_BUILDER("DequantizeLinear", DequantizeLinearOpBuilder),
+    REGISTER_OP_BUILDER("QLinearMatMul", QLinearMatMulOpBuilder),
+    REGISTER_OP_BUILDER("QLinearAdd", QLinearAddOpBuilder),
+    REGISTER_OP_BUILDER("QLinearMul", QLinearMulOpBuilder),
+    REGISTER_OP_BUILDER("QLinearConcat", QLinearConcatOpBuilder),
+    REGISTER_OP_BUILDER("Gather", GatherOpBuilder),
+    REGISTER_OP_BUILDER("Tile", TileOpBuilder),
+    REGISTER_OP_BUILDER("Squeeze", SqueezeOpBuilder),
+    REGISTER_OP_BUILDER("Unsqueeze", UnsqueezeOpBuilder),
+    REGISTER_OP_BUILDER("Resize", ResizeOpBuilder),
+    REGISTER_OP_BUILDER("Cast", CastOpBuilder),
+};
+
+#undef REGISTER_OP_BUILDER
+
+// Instantiates one builder per registry entry and stores the results in
+// supported_builtins, keyed by the registry key.
+template <typename T>
+struct OpBuildConstructor {
+  T supported_builtins;
+  // registry: key -> factory map. Taken by const reference so the map and the
+  // std::function objects it holds are not copied just to be iterated.
+  OpBuildConstructor(
+      const std::map<typename T::key_type, createIOpBuildItemFunc>& registry) {
+    LOGS_DEFAULT(INFO) << "Initialize supported ops";
+    for (const auto& kv : registry) {
+      // kv.second() invokes the factory and yields the builder instance.
+      supported_builtins.insert(std::make_pair(kv.first, kv.second()));
+    }
+  }
+};
+
+// Returns the table of builder instances created from reg. The table is a
+// function-local static, so it is built on the first call and reused afterwards.
+inline const OpBuildItemType& SupportedBuiltinOps() {
+  static OpBuildConstructor<OpBuildItemType> builtin_ops(reg);
+  const OpBuildItemType& table = builtin_ops.supported_builtins;
+  return table;
+}
+}  // namespace npu
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/patches/AccuracyCorrection.patch
+++ b/onnxruntime/core/providers/vsinpu/patches/AccuracyCorrection.patch
@ -0,0 +1,26 @@
+diff --git a/onnxruntime/test/providers/checkers.cc b/onnxruntime/test/providers/checkers.cc
+index 47c18c478d..93b44501cd 100644
+--- a/onnxruntime/test/providers/checkers.cc
+++ b/onnxruntime/test/providers/checkers.cc
+@@ -195,7 +195,7 @@ struct TensorCheck<uint8_t> {
+     // For any other EPs, we still expect an exact match for the results
+     // TODO: Verify if DML can possibly have a ROUNDING_MODE parameter and conform to the other EPs #41968513
+     if ((provider_type == kNnapiExecutionProvider || provider_type == kDmlExecutionProvider ||
+-         provider_type == kXnnpackExecutionProvider) &&
+         provider_type == kXnnpackExecutionProvider || provider_type == kVSINPUExecutionProvider) &&
+         (has_abs_err || has_rel_err)) {
+       double threshold = has_abs_err ? *(params.absolute_error)
+                                      : 0.0;
+diff --git a/onnxruntime/test/providers/cpu/nn/qlinearconv_op_test.cc b/onnxruntime/test/providers/cpu/nn/qlinearconv_op_test.cc
+index 2bc0df5e36..7beb78c2ff 100644
+--- a/onnxruntime/test/providers/cpu/nn/qlinearconv_op_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/qlinearconv_op_test.cc
+@@ -498,7 +498,7 @@ class QLinearConvOpTester {
+     // NOTE, for now the tolerance will only apply if the NNAPI is actually used,
+     // if for any reason the execution falls back to CPU, we still expect an exact match
+     // See, 'void Check<uint8_t>(...' in onnxruntime/test/providers/provider_test_utils.cc
+-#if defined(USE_NNAPI) || defined(USE_DML)
+#if defined(USE_NNAPI) || defined(USE_DML) || defined(USE_VSINPU)
+     // TODO: Verify if DML can possibly have a ROUNDING_MODE parameter and conform to the other EPs #41968513
+     abs_error = 1.0f;
+ #endif
--- a/onnxruntime/core/providers/vsinpu/patches/local_testing_record_res.patch
+++ b/onnxruntime/core/providers/vsinpu/patches/local_testing_record_res.patch
@ -0,0 +1,343 @@
+diff --git a/onnxruntime/test/onnx/dataitem_request.cc b/onnxruntime/test/onnx/dataitem_request.cc
+index 1ee302d5d5..5c2dd5ab00 100644
+--- a/onnxruntime/test/onnx/dataitem_request.cc
+++ b/onnxruntime/test/onnx/dataitem_request.cc
+@@ -135,6 +135,7 @@ std::pair<EXECUTE_RESULT, TIME_SPEC> DataTaskRequestContext::RunImpl() {
+   }
+ 
+   EXECUTE_RESULT res = EXECUTE_RESULT::SUCCESS;
+  int32_t out_idx = 0;
+   for (auto& output : expected_output_values) {
+     const std::string& output_name = output.first;
+     OrtValue* expected_output_value = output.second;  // Automatic cast
+@@ -170,7 +171,7 @@ std::pair<EXECUTE_RESULT, TIME_SPEC> DataTaskRequestContext::RunImpl() {
+       } else {  // Both expect and actual OrtValues are not None, proceed with data checking
+         ret =
+             CompareOrtValue(*actual_output_value, *expected_output_value, per_sample_tolerance,
+-                            relative_per_sample_tolerance, post_procesing);
+                            relative_per_sample_tolerance, post_procesing, out_idx);
+       }
+     } else {  // Expected output is None, ensure that the received output OrtValue is None as well
+       if (actual_output_value->IsAllocated()) {
+@@ -223,9 +224,10 @@ std::pair<EXECUTE_RESULT, TIME_SPEC> DataTaskRequestContext::RunImpl() {
+     if (compare_result != COMPARE_RESULT::SUCCESS && !ret.second.empty()) {
+       LOGS_DEFAULT(ERROR) << test_case_.GetTestCaseName() << ":output=" << output_name << ":" << ret.second;
+     }
+-    if (compare_result != COMPARE_RESULT::SUCCESS) {
+-      break;
+-    }
+    // if (compare_result != COMPARE_RESULT::SUCCESS) {
+    //   break;
+    // }
+    out_idx ++;
+   }
+   return std::make_pair(res, spent_time_);
+ }
+diff --git a/onnxruntime/test/providers/checkers.cc b/onnxruntime/test/providers/checkers.cc
+index f1a7240ea3..436031dfa8 100644
+--- a/onnxruntime/test/providers/checkers.cc
+++ b/onnxruntime/test/providers/checkers.cc
+@@ -154,6 +154,7 @@ struct TensorCheck<int8_t> {
+     }
+ 
+     const bool has_abs_err = params.absolute_error.has_value();
+    const int8_t default_abs_err = 1;
+     if (has_abs_err) {
+       double threshold = *(params.absolute_error);
+ 
+@@ -162,7 +163,8 @@ struct TensorCheck<int8_t> {
+       }
+     } else {
+       for (int i = 0; i < size; ++i) {
+-        EXPECT_EQ(cur_expected[i], cur_actual[i]) << "i:" << i;
+        // EXPECT_EQ(cur_expected[i], cur_actual[i]) << "i:" << i;
+        EXPECT_NEAR(cur_expected[i], cur_actual[i], default_abs_err) << "i:" << i;
+       }
+     }
+   }
+diff --git a/onnxruntime/test/util/compare_ortvalue.cc b/onnxruntime/test/util/compare_ortvalue.cc
+index 3d53d4a3a0..8129af1820 100644
+--- a/onnxruntime/test/util/compare_ortvalue.cc
+++ b/onnxruntime/test/util/compare_ortvalue.cc
+@@ -138,11 +138,75 @@ std::pair<COMPARE_RESULT, std::string> CompareFloatResult(const Tensor& outvalue
+   return res;
+ }
+ 
+template <typename FLOAT_TYPE>
+std::pair<COMPARE_RESULT, std::string> CompareFloatResult(const Tensor& outvalue, const Tensor& expected_value,
+                                                          double per_sample_tolerance,
+                                                          double relative_per_sample_tolerance, bool post_processing, int32_t out_idx) {
+  const size_t size1 = static_cast<size_t>(expected_value.Shape().Size());
+  const FLOAT_TYPE* expected_output = expected_value.Data<FLOAT_TYPE>();
+  const FLOAT_TYPE* real_output = outvalue.Data<FLOAT_TYPE>();
+
+  std::string expected_name = "expected_res"+ std::to_string(out_idx)+ ".txt";
+  std::string npures_name = "npu_res"+ std::to_string(out_idx)+ ".txt";
+  std::ofstream expected_res(expected_name), npu_res(npures_name);
+  for(size_t i = 0 ; i < size1; i++){
+    expected_res << expected_output[i] << std::endl;
+    npu_res << real_output[i] << std::endl;
+  }
+  expected_res.close();
+  npu_res.close();
+
+  std::pair<COMPARE_RESULT, std::string> res = std::make_pair(COMPARE_RESULT::SUCCESS, "");
+  double max_diff = 0;
+  size_t diff_count = 0;
+  for (size_t di = 0; di != size1; ++di) {
+    const double real_value =
+        post_processing ? std::max<double>(0.0, std::min<double>(255.0, real_output[di])) : real_output[di];
+    const double diff = std::fabs(expected_output[di] - real_value);
+    const double tol = per_sample_tolerance + relative_per_sample_tolerance * std::fabs(expected_output[di]);
+    if (!IsResultCloselyMatch<double>(real_value, expected_output[di], diff, tol)) {
+      res.first = COMPARE_RESULT::RESULT_DIFFERS;
+      // update error message if this is a larger diff
+      if (diff > max_diff || (std::isnan(diff) && !std::isnan(max_diff))) {
+        int64_t expected_int = 0;
+        int64_t real_int = 0;
+        memcpy(&expected_int, &expected_output[di], sizeof(FLOAT_TYPE));
+        memcpy(&real_int, &real_output[di], sizeof(FLOAT_TYPE));
+
+        std::ostringstream oss;
+        oss << std::hex << "expected " << expected_output[di] << " (" << expected_int << "), got " << real_value << " ("
+            << real_int << ")"
+            << ", diff: " << diff << ", tol=" << tol << std::dec << " idx=" << di << ".";
+        res.second = oss.str();
+        max_diff = diff;
+      }
+      ++diff_count;
+    }
+  }
+
+  if (res.first == COMPARE_RESULT::SUCCESS) return res;
+
+  std::ostringstream oss;
+  oss << res.second << " " << diff_count << " of " << size1 << " differ";
+  res.second = oss.str();
+  return res;
+}
+
+
+ template <typename T>
+-std::pair<COMPARE_RESULT, std::string> IsResultExactlyMatch(const Tensor& outvalue, const Tensor& expected_value) {
+std::pair<COMPARE_RESULT, std::string> IsResultExactlyMatch(const Tensor& outvalue, const Tensor& expected_value, int32_t out_idx) {
+   const size_t size1 = static_cast<size_t>(expected_value.Shape().Size());
+   const T* expected_output = expected_value.Data<T>();
+   const T* real_output = outvalue.Data<T>();
+  std::string expected_name = "expected_res"+ std::to_string(out_idx)+ ".txt";
+  std::string npures_name = "npu_res"+ std::to_string(out_idx)+ ".txt";
+  std::ofstream expected_res(expected_name), npu_res(npures_name);
+  for(size_t i = 0 ; i < size1; i++){
+    expected_res << expected_output[i] << std::endl;
+    npu_res << real_output[i] << std::endl;
+  }
+  expected_res.close();
+  npu_res.close();
+   for (size_t di = 0; di != size1; ++di) {
+     if (expected_output[di] != real_output[di]) {
+       std::ostringstream oss;
+@@ -201,7 +265,7 @@ std::pair<COMPARE_RESULT, std::string> CompareBFloat16Result(const Tensor& outva
+ 
+ std::pair<COMPARE_RESULT, std::string> CompareTwoTensors(const Tensor& outvalue, const Tensor& expected_tensor,
+                                                          double per_sample_tolerance,
+-                                                         double relative_per_sample_tolerance, bool post_processing) {
+                                                         double relative_per_sample_tolerance, bool post_processing, int32_t out_idx) {
+   if (expected_tensor.Shape() != outvalue.Shape()) {
+     std::ostringstream oss;
+     oss << "shape mismatch, expect " << expected_tensor.Shape().ToString() << " got " << outvalue.Shape().ToString();
+@@ -209,30 +273,30 @@ std::pair<COMPARE_RESULT, std::string> CompareTwoTensors(const Tensor& outvalue,
+   }
+   if (outvalue.IsDataType<float>()) {
+     return CompareFloatResult<float>(outvalue, expected_tensor, per_sample_tolerance, relative_per_sample_tolerance,
+-                                     post_processing);
+                                     post_processing, out_idx);
+   } else if (outvalue.IsDataType<double>()) {
+     return CompareFloatResult<double>(outvalue, expected_tensor, per_sample_tolerance, relative_per_sample_tolerance,
+-                                      post_processing);
+                                      post_processing, out_idx);
+   } else if (outvalue.IsDataTypeString()) {
+-    return IsResultExactlyMatch<std::string>(outvalue, expected_tensor);
+    return IsResultExactlyMatch<std::string>(outvalue, expected_tensor, out_idx);
+   } else if (outvalue.IsDataType<uint8_t>()) {
+-    return IsResultExactlyMatch<uint8_t>(outvalue, expected_tensor);
+    return IsResultExactlyMatch<uint8_t>(outvalue, expected_tensor, out_idx);
+   } else if (outvalue.IsDataType<int8_t>()) {
+-    return IsResultExactlyMatch<int8_t>(outvalue, expected_tensor);
+    return IsResultExactlyMatch<int8_t>(outvalue, expected_tensor, out_idx);
+   } else if (outvalue.IsDataType<uint16_t>()) {
+-    return IsResultExactlyMatch<uint16_t>(outvalue, expected_tensor);
+    return IsResultExactlyMatch<uint16_t>(outvalue, expected_tensor, out_idx);
+   } else if (outvalue.IsDataType<int16_t>()) {
+-    return IsResultExactlyMatch<int16_t>(outvalue, expected_tensor);
+    return IsResultExactlyMatch<int16_t>(outvalue, expected_tensor, out_idx);
+   } else if (outvalue.IsDataType<uint32_t>()) {
+-    return IsResultExactlyMatch<uint32_t>(outvalue, expected_tensor);
+    return IsResultExactlyMatch<uint32_t>(outvalue, expected_tensor, out_idx);
+   } else if (outvalue.IsDataType<int32_t>()) {
+-    return IsResultExactlyMatch<int32_t>(outvalue, expected_tensor);
+    return IsResultExactlyMatch<int32_t>(outvalue, expected_tensor, out_idx);
+   } else if (outvalue.IsDataType<uint64_t>()) {
+-    return IsResultExactlyMatch<uint64_t>(outvalue, expected_tensor);
+    return IsResultExactlyMatch<uint64_t>(outvalue, expected_tensor, out_idx);
+   } else if (outvalue.IsDataType<int64_t>()) {
+-    return IsResultExactlyMatch<int64_t>(outvalue, expected_tensor);
+    return IsResultExactlyMatch<int64_t>(outvalue, expected_tensor, out_idx);
+   } else if (outvalue.IsDataType<bool>()) {
+-    return IsResultExactlyMatch<bool>(outvalue, expected_tensor);
+    return IsResultExactlyMatch<bool>(outvalue, expected_tensor, out_idx);
+   } else if (outvalue.IsDataType<MLFloat16>()) {
+     return CompareFloat16Result(outvalue, expected_tensor, per_sample_tolerance, relative_per_sample_tolerance,
+                                 post_processing);
+@@ -300,7 +364,7 @@ std::pair<COMPARE_RESULT, std::string> CompareSparseTensors(const SparseTensor&
+                      " actual: ", actual.Format());
+ 
+   TEST_RETURN_IF_ERROR(CompareTwoTensors(actual.Values(), expected.Values(),
+-                                         per_sample_tolerance, relative_per_sample_tolerance, post_processing),
+                                         per_sample_tolerance, relative_per_sample_tolerance, post_processing, 0),
+                        "While comparing sparse values");
+ 
+   if (actual.Format() == SparseFormat::kCoo) {
+@@ -308,16 +372,16 @@ std::pair<COMPARE_RESULT, std::string> CompareSparseTensors(const SparseTensor&
+     auto expected_view = expected.AsCoo();
+ 
+     TEST_RETURN_IF_ERROR(CompareTwoTensors(actual_view.Indices(), expected_view.Indices(),
+-                                           per_sample_tolerance, relative_per_sample_tolerance, post_processing),
+                                           per_sample_tolerance, relative_per_sample_tolerance, post_processing, 0),
+                          "Comparing COO indices");
+   } else if (actual.Format() == SparseFormat::kCsrc) {
+     auto actual_view = actual.AsCsr();
+     auto expected_view = expected.AsCsr();
+     TEST_RETURN_IF_ERROR(CompareTwoTensors(actual_view.Inner(), expected_view.Inner(),
+-                                           per_sample_tolerance, relative_per_sample_tolerance, post_processing),
+                                           per_sample_tolerance, relative_per_sample_tolerance, post_processing, 0),
+                          "Comparing Csr(c) inner indices");
+     TEST_RETURN_IF_ERROR(CompareTwoTensors(actual_view.Outer(), expected_view.Outer(),
+-                                           per_sample_tolerance, relative_per_sample_tolerance, post_processing),
+                                           per_sample_tolerance, relative_per_sample_tolerance, post_processing, 0),
+                          "Comparing Csr(c) outer indices");
+   }
+ 
+@@ -385,7 +449,83 @@ std::pair<COMPARE_RESULT, std::string> CompareOrtValue(const OrtValue& o, const
+       return std::make_pair(COMPARE_RESULT::TYPE_MISMATCH, oss.str());
+     }
+     return CompareTwoTensors(outvalue, expected_tensor, per_sample_tolerance, relative_per_sample_tolerance,
+-                             post_processing);
+                             post_processing, 0);
+  } else if (o.IsSparseTensor()) {
+#if !defined(DISABLE_SPARSE_TENSORS)
+    TEST_RETURN_IF_NOT(expected_mlvalue.IsSparseTensor(), COMPARE_RESULT::TYPE_MISMATCH,
+                       "SparseTensor is not expected as output");
+    TEST_RETURN_IF_ERROR(CompareSparseTensors(o.Get<SparseTensor>(), expected_mlvalue.Get<SparseTensor>(),
+                                              per_sample_tolerance, relative_per_sample_tolerance,
+                                              post_processing),
+                         "while comaring sparse tensors");
+#endif
+    return std::make_pair(COMPARE_RESULT::SUCCESS, "");
+  } else if (o.IsTensorSequence()) {
+    auto& expected_tensor_seq = expected_mlvalue.Get<TensorSeq>();
+    auto expected_tensor_count = expected_tensor_seq.Size();
+
+    auto& actual_tensor_seq = o.Get<TensorSeq>();
+    auto actual_tensor_count = actual_tensor_seq.Size();
+
+    if (expected_tensor_count != actual_tensor_count) {
+      std::ostringstream oss;
+      oss << "expected tensor count in the sequence: " << expected_tensor_count << " got "
+          << actual_tensor_count;
+      return std::make_pair(COMPARE_RESULT::RESULT_DIFFERS, oss.str());
+    }
+
+    if (!expected_tensor_seq.IsSameDataType(actual_tensor_seq)) {
+      std::ostringstream oss;
+      oss << "expected tensor type in the sequence: " << expected_tensor_seq.DataType() << " got "
+          << actual_tensor_seq.DataType();
+      return std::make_pair(COMPARE_RESULT::TYPE_MISMATCH, oss.str());
+    }
+
+    for (size_t i = 0; i < expected_tensor_count; ++i) {
+      auto res = CompareTwoTensors(actual_tensor_seq.Get(i), expected_tensor_seq.Get(i), per_sample_tolerance, relative_per_sample_tolerance,
+                                   post_processing,0);
+      if (res.first != COMPARE_RESULT::SUCCESS) {
+        return res;
+      }
+    }
+
+    return std::make_pair(COMPARE_RESULT::SUCCESS, "");
+
+  } else {
+    // Maps
+#if !defined(DISABLE_ML_OPS)
+    if (o.Type() == DataTypeImpl::GetType<VectorMapInt64ToFloat>()) {
+      return CompareSeqOfMapToFloat(o.Get<VectorMapInt64ToFloat>(), expected_mlvalue.Get<VectorMapInt64ToFloat>(),
+                                    per_sample_tolerance, relative_per_sample_tolerance, post_processing);
+    }
+    if (o.Type() == DataTypeImpl::GetType<VectorMapStringToFloat>()) {
+      return CompareSeqOfMapToFloat(o.Get<VectorMapStringToFloat>(), expected_mlvalue.Get<VectorMapStringToFloat>(),
+                                    per_sample_tolerance, relative_per_sample_tolerance, post_processing);
+    }
+    return std::make_pair(COMPARE_RESULT::NOT_SUPPORT, "");
+#else
+    return std::make_pair(COMPARE_RESULT::NOT_SUPPORT, "Map type is not supported in this build.");
+#endif
+  }
+}
+
+std::pair<COMPARE_RESULT, std::string> CompareOrtValue(const OrtValue& o, const OrtValue& expected_mlvalue,
+                                                       double per_sample_tolerance,
+                                                       double relative_per_sample_tolerance, bool post_processing, int32_t out_idx) {
+  if (o.Type() != expected_mlvalue.Type()) {
+    return std::make_pair(COMPARE_RESULT::TYPE_MISMATCH, "");
+  }
+  if (o.IsTensor()) {
+    const Tensor& outvalue = o.Get<Tensor>();
+    const Tensor& expected_tensor = expected_mlvalue.Get<Tensor>();
+    if (outvalue.DataType() != expected_tensor.DataType()) {
+      std::ostringstream oss;
+      oss << "expect " << ElementTypeToString(expected_tensor.DataType()) << " got "
+          << ElementTypeToString(outvalue.DataType());
+      return std::make_pair(COMPARE_RESULT::TYPE_MISMATCH, oss.str());
+    }
+    return CompareTwoTensors(outvalue, expected_tensor, per_sample_tolerance, relative_per_sample_tolerance,
+                             post_processing, out_idx);
+   } else if (o.IsSparseTensor()) {
+ #if !defined(DISABLE_SPARSE_TENSORS)
+     TEST_RETURN_IF_NOT(expected_mlvalue.IsSparseTensor(), COMPARE_RESULT::TYPE_MISMATCH,
+@@ -419,7 +559,7 @@ std::pair<COMPARE_RESULT, std::string> CompareOrtValue(const OrtValue& o, const
+ 
+     for (size_t i = 0; i < expected_tensor_count; ++i) {
+       auto res = CompareTwoTensors(actual_tensor_seq.Get(i), expected_tensor_seq.Get(i), per_sample_tolerance, relative_per_sample_tolerance,
+-                                   post_processing);
+                                   post_processing, out_idx);
+       if (res.first != COMPARE_RESULT::SUCCESS) {
+         return res;
+       }
+diff --git a/onnxruntime/test/util/include/compare_ortvalue.h b/onnxruntime/test/util/include/compare_ortvalue.h
+index 24b74b9002..8269346528 100644
+--- a/onnxruntime/test/util/include/compare_ortvalue.h
+++ b/onnxruntime/test/util/include/compare_ortvalue.h
+@@ -24,7 +24,9 @@ enum class COMPARE_RESULT { SUCCESS,
+ std::pair<COMPARE_RESULT, std::string> CompareOrtValue(const OrtValue& real, const OrtValue& expected,
+                                                        double per_sample_tolerance,
+                                                        double relative_per_sample_tolerance, bool post_processing);
+-
+std::pair<COMPARE_RESULT, std::string> CompareOrtValue(const OrtValue& real, const OrtValue& expected,
+                                                       double per_sample_tolerance,
+                                                       double relative_per_sample_tolerance, bool post_processing, int32_t out_idx);
+ // verify if the 'value' matches the 'expected' ValueInfoProto. 'value' is a model output
+ std::pair<COMPARE_RESULT, std::string> VerifyValueInfo(const ONNX_NAMESPACE::ValueInfoProto& expected,
+                                                        const OrtValue* value);
+diff --git a/onnxruntime/test/util/include/test/compare_ortvalue.h b/onnxruntime/test/util/include/test/compare_ortvalue.h
+index 545df706c9..170eb9dc4c 100644
+--- a/onnxruntime/test/util/include/test/compare_ortvalue.h
+++ b/onnxruntime/test/util/include/test/compare_ortvalue.h
+@@ -28,7 +28,9 @@ enum class COMPARE_RESULT {
+ std::pair<COMPARE_RESULT, std::string> CompareOrtValue(const OrtValue& real, const OrtValue& expected,
+                                                        double per_sample_tolerance,
+                                                        double relative_per_sample_tolerance, bool post_processing);
+-
+std::pair<COMPARE_RESULT, std::string> CompareOrtValue(const OrtValue& real, const OrtValue& expected,
+                                                       double per_sample_tolerance,
+                                                       double relative_per_sample_tolerance, bool post_processing, int32_t out_idx);
+ // Compare two OrtValue numerically equal or not. The difference with CompareOrtValue is that this function
+ // will only check the numerical values of the OrtValue, and ignore the type, shape, etc.
+ //
--- a/onnxruntime/core/providers/vsinpu/patches/mlas_crosscompiling.patch
+++ b/onnxruntime/core/providers/vsinpu/patches/mlas_crosscompiling.patch
@ -0,0 +1,34 @@
+diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
+index e0ccc504d7..6c5aa6ea53 100644
+--- a/cmake/onnxruntime_mlas.cmake
+++ b/cmake/onnxruntime_mlas.cmake
+@@ -335,7 +335,7 @@ else()
+           ${MLAS_SRC_DIR}/qgemm_kernel_udot.cpp
+           ${MLAS_SRC_DIR}/qgemm_kernel_sdot.cpp
+         )
+-        if (NOT APPLE)
+        if (NOT APPLE AND NOT onnxruntime_USE_VSINPU)
+           set(mlas_platform_srcs
+             ${mlas_platform_srcs}
+             ${MLAS_SRC_DIR}/aarch64/HalfGemmKernelNeon.S
+diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h
+index fd6b3df934..f81f1c42b6 100644
+--- a/onnxruntime/core/mlas/inc/mlas.h
+++ b/onnxruntime/core/mlas/inc/mlas.h
+@@ -79,6 +79,7 @@ Abstract:
+
+ #if (!defined(_MSC_VER)) || (_MSC_VER >= 1930)
+ #if defined(MLAS_TARGET_ARM64) || defined(MLAS_TARGET_ARM64EC)
+#if !defined(USE_VSINPU)
+ #if !defined(__APPLE__)
+ // Had to temporary disable fp16 under APPLE ARM64, as compiling
+ // the source files require a hardware specific compilation flag.
+@@ -87,7 +88,8 @@ Abstract:
+
+ #define MLAS_F16VEC_INTRINSICS_SUPPORTED
+
+-#endif //
+#endif
+#endif //
+ #endif // ARM64
+ #endif // Visual Studio 16 or earlier does not support fp16 intrinsic
--- a/onnxruntime/core/providers/vsinpu/patches/test_scripts/compare_cosine_sim.py
+++ b/onnxruntime/core/providers/vsinpu/patches/test_scripts/compare_cosine_sim.py
@ -0,0 +1,29 @@
+import sys
+
+import numpy as np
+from numpy.linalg import norm
+
+
+def read_values(filename):
+    """Read one float per line from ``filename`` and return them as a NumPy array."""
+    parsed = []
+    with open(filename) as file:
+        for line in file:
+            parsed.append(float(line.strip()))
+    return np.array(parsed)
+
+
+def cosine_similarity(vec1, vec2):
+    """Return the cosine similarity of two equally sized 1-D vectors.
+
+    Returns NaN when either vector has zero magnitude, since the similarity is
+    undefined there. The plain division produced the same NaN, but only as the
+    by-product of a 0/0 division that also emitted a NumPy RuntimeWarning.
+    """
+    # Computed first so that mismatched lengths still raise ValueError from np.dot.
+    dot_product = np.dot(vec1, vec2)
+    denominator = norm(vec1) * norm(vec2)
+    if denominator == 0:
+        return np.nan
+    return dot_product / denominator
+
+
+if __name__ == "__main__":
+    # Exactly two positional arguments are expected: the two result files to compare.
+    if len(sys.argv) != 3:
+        # The usage text must name this script (compare_cosine_sim.py).
+        print("Usage: python compare_cosine_sim.py <file1> <file2>")
+        sys.exit(1)
+
+    file1 = sys.argv[1]
+    file2 = sys.argv[2]
+
+    vec1 = read_values(file1)
+    vec2 = read_values(file2)
+
+    similarity = cosine_similarity(vec1, vec2)
+    print(f"Cosine Similarity: {similarity}")
--- a/onnxruntime/core/providers/vsinpu/patches/test_scripts/compare_topn.py
+++ b/onnxruntime/core/providers/vsinpu/patches/test_scripts/compare_topn.py
@ -0,0 +1,34 @@
+import sys
+
+
+def read_values(filename):
+    """Read one float per line and pair each value with its 1-based line number."""
+    pairs = []
+    with open(filename) as file:
+        for line_no, line in enumerate(file, start=1):
+            pairs.append((float(line.strip()), line_no))
+    return pairs
+
+
+def top_n(values, N):
+    """Return the first ``N`` entries of ``values`` after sorting by first element, descending."""
+    ranked = list(values)
+    ranked.sort(key=lambda entry: entry[0], reverse=True)
+    return ranked[:N]
+
+
+def compare_files(cpu_file, npu_file, N):
+    """Print the top-``N`` (value, line number) entries of the CPU and the NPU result files."""
+    # Load both files before ranking or printing, so a missing file fails before any output.
+    loaded = [(path, read_values(path)) for path in (cpu_file, npu_file)]
+    ranked = [(path, top_n(values, N)) for path, values in loaded]
+
+    for path, entries in ranked:
+        print(f"Top-{N} values in {path}: {entries}")
+
+
+if __name__ == "__main__":
+    usage = "Usage: python compare_topn.py <N> <cpu_file> <npu_file>"
+    if len(sys.argv) != 4:
+        print(usage)
+        sys.exit(1)
+
+    # N must be a positive integer. A non-numeric value would otherwise raise an
+    # unhandled ValueError, and a zero or negative value would make the slice in
+    # top_n() return an empty or tail-truncated list instead of the top entries.
+    try:
+        N = int(sys.argv[1])
+    except ValueError:
+        N = 0
+    if N <= 0:
+        print(f"N must be a positive integer, got {sys.argv[1]!r}")
+        print(usage)
+        sys.exit(1)
+
+    cpu_file = sys.argv[2]
+    npu_file = sys.argv[3]
+
+    compare_files(cpu_file, npu_file, N)
--- a/onnxruntime/core/providers/vsinpu/patches/test_scripts/result_compare.sh
+++ b/onnxruntime/core/providers/vsinpu/patches/test_scripts/result_compare.sh
@ -0,0 +1,23 @@
+#!/bin/bash
+# Compare the recorded reference ("expected") and NPU outputs of a model run.
+#
+# Usage: result_compare.sh <res_file_dir> <output_num>
+#   res_file_dir  directory containing expected_res<i>.txt and npu_res<i>.txt
+#   output_num    number of outputs to compare (indices 0 .. output_num-1)
+if [ "$#" -ne 2 ]; then
+  echo "Usage: $0 <res_file_dir> <output_num>" >&2
+  exit 1
+fi
+
+res_file_dir=$1
+output_num=$2
+
+# Reject anything that is not a non-negative integer before it reaches $(( )).
+case "$output_num" in
+  ''|*[!0-9]*)
+    echo "output_num must be a non-negative integer, got '${output_num}'" >&2
+    exit 1
+    ;;
+esac
+
+# Resolve the helper scripts relative to this script so it works from any directory.
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# specifying N value
+N=5
+
+for i in $(seq 0 $((output_num-1)));
+do
+  # Build the file names for this output index
+  golden_file="${res_file_dir}/expected_res${i}.txt"
+  npu_file="${res_file_dir}/npu_res${i}.txt"
+
+  echo "Comparing Top-${N} for the output_${i}"
+  python3 "${script_dir}/compare_topn.py" "$N" "$golden_file" "$npu_file"
+
+  echo "--------------------------------"
+
+  echo "Comparing Cosine Similarity for output_${i}:"
+  python3 "${script_dir}/compare_cosine_sim.py" "$golden_file" "$npu_file"
+
+  echo ""
+done
--- a/onnxruntime/core/providers/vsinpu/symbols.txt
+++ b/onnxruntime/core/providers/vsinpu/symbols.txt
@ -0,0 +1 @@
+OrtSessionOptionsAppendExecutionProvider_VSINPU
--- a/onnxruntime/core/providers/vsinpu/vsinpu_ep_graph.cc
+++ b/onnxruntime/core/providers/vsinpu/vsinpu_ep_graph.cc
@ -0,0 +1,296 @@
+
+/****************************************************************************
+ *
+ *    Copyright (c) 2023 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#include <algorithm>
+#include "core/providers/vsinpu/vsinpu_ep_graph.h"
+#include "core/providers/vsinpu/builders/op_builder_factory.h"
+#include "core/providers/vsinpu/vsinpu_util.h"
+#include "core/framework/node_unit.h"
+#include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h"
+#include "core/optimizer/qdq_transformer/selectors_actions/shared/utils.h"
+
+namespace onnxruntime {
+
+namespace vsi {
+namespace npu {
+// Constructs the graph wrapper for `graph_viewer`: records the quantized-op inputs
+// via Prepare(), then creates a TIM-VX context and a graph from that context.
+GraphEP::GraphEP(const onnxruntime::GraphViewer& graph_viewer) : graph_viewer_(graph_viewer) {
+  Prepare();  // return value ignored; Prepare() as written always returns true
+  context_ = tim::vx::Context::Create();
+  graph_ = context_->CreateGraph();
+  compiled_ = false;  // NOTE(review): presumably flipped once the graph is compiled elsewhere — confirm
+}
+
+// Groups the graph's nodes into NodeUnits and records, per input name, which
+// quantized NodeUnits consume that input (stored in all_quantized_op_inputs_).
+// Always returns true.
+bool GraphEP::Prepare() {
+  std::tie(node_unit_holder_, node_unit_map_) = QDQ::GetAllNodeUnits(graph_viewer_);
+
+  // Registers input `idx` of `unit` as an input consumed by a quantized op.
+  const auto record_input =
+      [&inputs_by_name = all_quantized_op_inputs_](const NodeUnit& unit, size_t idx) {
+        inputs_by_name[unit.Inputs()[idx].node_arg.Name()].push_back(&unit);
+      };
+
+  for (const auto& holder : node_unit_holder_) {
+    const auto op_kind = util::GetQuantizedOpType(*holder);
+
+    // Skip anything that is neither a qlinear op nor a qdq node group.
+    if (op_kind == util::QuantizedOpType::Unknown) {
+      continue;
+    }
+
+    // Every quantized op except QuantizeLinear has a quantized first input.
+    if (op_kind != util::QuantizedOpType::QuantizeLinear) {
+      record_input(*holder, 0);
+    }
+
+    if (!util::IsQuantizedBinaryOp(op_kind)) {
+      continue;
+    }
+
+    // Binary ops also record their second input; a conv with exactly three
+    // inputs additionally records the third one.
+    record_input(*holder, 1);
+    if (util::IsQuantizedConv(op_kind) && holder->Inputs().size() == 3) {
+      record_input(*holder, 2);
+    }
+  }  // All quantized inputs are recorded
+
+  return true;
+}
+
+// Returns whether `node_unit` can run on the NPU: looks up the builder registered
+// for the target node's op type and asks it; logs a warning and returns false
+// when no builder is registered.
+bool GraphEP::SupportedOp(const onnxruntime::GraphViewer& graph_viewer,
+                          const NodeUnit& node_unit) {
+  const auto& builders = vsi::npu::SupportedBuiltinOps();
+  const auto entry = builders.find(node_unit.GetNode().OpType());
+  if (entry == builders.end()) {
+    LOGS_DEFAULT(WARNING) << "Fallback unsupported op (node_unit) " << node_unit.OpType()
+                          << "  to cpu.";
+    return false;
+  }
+  return entry->second->IsSupported(graph_viewer, node_unit);
+}
+
+// Group-level support check; forwards to SupportedOp() with the arguments swapped.
+bool GraphEP::IsNodeSupportedInGroup(const NodeUnit& node_unit, const GraphViewer& graph_viewer) {
+  const bool supported = SupportedOp(graph_viewer, node_unit);
+  return supported;
+}
+
+// Returns the NodeUnit registered for `node` in node_unit_map_.
+// ORT_ENFORCE fails if `node` has no entry in the map.
+const NodeUnit& GraphEP::GetNodeUnit(const Node* node) const {
+  const auto node_unit_it = node_unit_map_.find(node);
+  ORT_ENFORCE(node_unit_it != node_unit_map_.end(), "Node does not have corresponding NodeUnit.");
+  return *node_unit_it->second;
+}
+
+// Replaces the tensor bound to `name` with `dst_tensor` in tensors_ (only if the
+// name is already present) and in the first matching entry of the graph input
+// and graph output lists.
+void GraphEP::UpdateTensorMap(const std::string& name, const std::shared_ptr<tim::vx::Tensor>& dst_tensor) {
+  const auto known = tensors_.find(name);
+  if (known != tensors_.end()) {
+    known->second = dst_tensor;
+  }
+
+  // Rebinds the first entry of `io_list` whose name matches, if there is one.
+  const auto rebind_first_match = [&name, &dst_tensor](auto& io_list) {
+    const auto hit = std::find_if(io_list.begin(), io_list.end(),
+                                  [&name](const auto& io) { return io->name == name; });
+    if (hit != io_list.end()) {
+      (*hit)->tensor = dst_tensor;
+    }
+  };
+  rebind_first_match(graph_inputs_);
+  rebind_first_match(graph_outputs_);
+}
+
+// Creates a NodeIOInfo for `op` that stores the names of its input and output
+// NodeArgs in order. Empty argument lists produce empty name lists.
+std::shared_ptr<NodeIOInfo> GraphEP::ConstructNodeIO(const std::shared_ptr<tim::vx::Operation>& op, std::vector<NodeArg*> input_arg, std::vector<NodeArg*> output_arg) {
+  // Collects the names of `args`, preserving their order.
+  const auto names_of = [](const std::vector<NodeArg*>& args) {
+    std::vector<std::string> names;
+    names.reserve(args.size());
+    for (const NodeArg* arg : args) {
+      names.push_back(arg->Name());
+    }
+    return names;
+  };
+
+  auto io_info = std::make_shared<vsi::npu::NodeIOInfo>();
+  io_info->op_ = op;
+  io_info->input_names_ = names_of(input_arg);
+  io_info->output_names_ = names_of(output_arg);
+  return io_info;
+}
+
+// Binds the tensors named in `nodeio_info` to its operation: all inputs first,
+// then all outputs, each in list order. Returns false (after logging an error)
+// as soon as a name is missing from tensors_ or maps to a null tensor; tensors
+// bound before that point stay bound.
+bool GraphEP::BindTensors(const std::shared_ptr<NodeIOInfo>& nodeio_info) {
+  const auto& op = nodeio_info->op_;
+
+  for (const auto& name : nodeio_info->input_names_) {
+    const auto entry = tensors_.find(name);
+    if (entry == tensors_.end() || entry->second == nullptr) {
+      LOGS_DEFAULT(ERROR) << "Input tensor not defined or not found!";
+      return false;
+    }
+    op->BindInput(entry->second);
+  }
+
+  for (const auto& name : nodeio_info->output_names_) {
+    const auto entry = tensors_.find(name);
+    if (entry == tensors_.end() || entry->second == nullptr) {
+      LOGS_DEFAULT(ERROR) << "Output tensor not defined or not found!";
+      return false;
+    }
+    op->BindOutput(entry->second);
+  }
+
+  return true;
+}
+
+// Returns the TIM-VX tensor mapped to `nudef.node_arg`, creating it on first use.
+//
+// A tensor already registered in `tensors_` under the same name is returned as-is.
+// Otherwise a new tensor is created on `graph` from the reversed ONNX shape, the
+// converted data type and (when the tensor is quantized) its quantization
+// parameters. The new tensor is attached to the graph input/output entry with the
+// same name, if any, and cached in `tensors_`.
+//
+// `graph_viewer` is not used by this function; the member `graph_viewer_` is.
+// Returns nullptr (after logging an error) when `attribute` is CONSTANT but no
+// constant initializer exists for the name.
+std::shared_ptr<tim::vx::Tensor> GraphEP::MapTIMVXTensor(
+    std::shared_ptr<tim::vx::Graph>& graph, const NodeUnitIODef nudef,
+    const NodeUnit& node_unit,
+    const GraphViewer* graph_viewer, tim::vx::TensorAttribute attribute) {
+  const auto& arg = nudef.node_arg;
+
+  // Single lookup: reuse the tensor when this name has already been mapped.
+  const auto cached = tensors_.find(arg.Name());
+  if (cached != tensors_.end()) {
+    return cached->second;
+  }
+  auto shape = vsi::npu::util::OnnxShapeToTIMVXShape(vsi::npu::util::GetTensorShape(arg));
+  // The dimension order is reversed before the TIM-VX spec is built.
+  std::reverse(shape.begin(), shape.end());
+  tim::vx::DataType dt = vsi::npu::util::OnnxDtypeToTIMVXDtype(arg.Type());
+  tim::vx::TensorSpec spec = tim::vx::TensorSpec(dt, shape, attribute);
+
+  // Tensors have same name may not have same status of quant_param existence, such as QLinearConv->MaxPool->QLinearConv
+  // Maxpool output tensor is not set quantization at first pass
+  bool is_qtensor = nudef.quant_param.has_value() || Contains(all_quantized_op_inputs_, arg.Name());
+  if (is_qtensor) {
+    float scale = 0.0f;
+    int32_t zp = 0;
+    std::optional<std::vector<float>> scales;
+    std::optional<std::vector<int32_t>> zps;
+    if (nudef.quant_param.has_value()) {
+      util::GetQuantizationScaleAndZeroPoint(graph_viewer_.GetAllInitializedTensors(),
+                                             nudef, node_unit.ModelPath(),
+                                             scale, zp, scales, zps);
+    } else {
+      // No quantization info on this def: borrow it from the first quantized
+      // node unit recorded as consuming this tensor.
+      const NodeUnit* target_nodeunit = all_quantized_op_inputs_[arg.Name()][0];
+      const auto& qinput = target_nodeunit->Inputs();
+      auto it = std::find_if(qinput.begin(), qinput.end(), [&arg](const NodeUnitIODef& nud) { return nud.node_arg.Name() == arg.Name(); });
+      bool is_conv_bias = std::distance(qinput.begin(), it) == 2;
+      if (!is_conv_bias || it->quant_param.has_value()) {
+        util::GetQuantizationScaleAndZeroPoint(graph_viewer_.GetAllInitializedTensors(),
+                                               *it, target_nodeunit->ModelPath(),
+                                               scale, zp, scales, zps);
+      } else {
+        // Reached only for the third operand (index 2) without quant params.
+        float in_scale, w_scale;
+        int32_t in_zp, w_zp;
+        std::optional<std::vector<float>> in_scales, w_scales;
+        std::optional<std::vector<int32_t>> in_zps, w_zps;
+
+        // onnx defines conv bias with non quantization, but it must be quantized in VSINPU support
+        // The bias scale is set as input_scale * weight_scale if per layer quantized, input_scale* weight_scale[i] if per channel quantized
+        util::GetQuantizationScaleAndZeroPoint(graph_viewer_.GetAllInitializedTensors(),
+                                               qinput[0], target_nodeunit->ModelPath(),
+                                               in_scale, in_zp, in_scales, in_zps);
+        util::GetQuantizationScaleAndZeroPoint(graph_viewer_.GetAllInitializedTensors(),
+                                               qinput[1], target_nodeunit->ModelPath(),
+                                               w_scale, w_zp, w_scales, w_zps);
+        scale = in_scale * w_scale;
+        zp = 0;
+        if (w_scales) {
+          std::vector<float> bias_scales;
+          bias_scales.reserve(w_scales->size());
+          for (const float w : *w_scales) {
+            bias_scales.push_back(w * in_scale);
+          }
+          scales = bias_scales;
+        }
+      }
+    }
+    tim::vx::Quantization quant;
+    // per tensor quantization
+    if (!scales.has_value()) {
+      quant.SetType(tim::vx::QuantType::ASYMMETRIC);
+      quant.SetScales({scale});
+      quant.SetZeroPoints({zp});
+    } else {  // per channel quantization
+      if (zps.has_value()) {
+        bool has_nonzero = std::any_of(zps->begin(), zps->end(), [](int32_t elem) { return elem != 0; });
+        if (has_nonzero && *arg.Type() == "tensor(uint8)") {
+          quant.SetType(tim::vx::QuantType::ASYMMETRIC_PER_CHANNEL);
+        } else {
+          quant.SetType(tim::vx::QuantType::SYMMETRIC_PER_CHANNEL);
+        }
+        quant.SetZeroPoints(zps.value());
+      } else {
+        if (*arg.Type() == "tensor(int32)" || zp == 0) {
+          // set bias quant type
+          quant.SetType(tim::vx::QuantType::SYMMETRIC_PER_CHANNEL);
+        } else {
+          quant.SetType(tim::vx::QuantType::ASYMMETRIC_PER_CHANNEL);
+        }
+        quant.SetZeroPoints({zp});
+      }
+      quant.SetScales(scales.value());
+      // Channel dimension is the last one of the (already reversed) shape.
+      quant.SetChannelDim(shape.size() - 1);
+    }
+    spec.SetQuantization(quant);
+  }
+
+  std::shared_ptr<tim::vx::Tensor> tensor;
+  if (attribute ==
+      tim::vx::TensorAttribute::CONSTANT) {  // create const tensor
+    const ONNX_NAMESPACE::TensorProto* tensor_proto =
+        graph_viewer_.GetConstantInitializer(arg.Name(), true);
+    // Guard against a missing initializer instead of dereferencing a null pointer.
+    if (tensor_proto == nullptr) {
+      LOGS_DEFAULT(ERROR) << "Constant initializer not found for tensor: " << arg.Name();
+      return nullptr;
+    }
+    std::shared_ptr<uint8_t> unpackedTensor =
+        vsi::npu::util::UnpackTensor(&arg, *tensor_proto);
+
+    const void* valueAddr =
+        reinterpret_cast<const void*>(unpackedTensor.get());
+    tensor = graph->CreateTensor(spec, valueAddr);
+
+  } else {
+    tensor = graph->CreateTensor(spec);
+  }
+  // Attach the new tensor to the matching graph input entry, if any.
+  for (auto& input : graph_inputs_) {
+    if (input->name == arg.Name()) {
+      input->tensor = tensor;
+      input->shape = vsi::npu::util::GetTensorShape(arg);
+      break;
+    }
+  }
+  // Attach the new tensor to the matching graph output entry, if any.
+  for (auto& output : graph_outputs_) {
+    if (output->name == arg.Name()) {
+      output->tensor = tensor;
+      output->shape = utils::GetTensorShapeFromTensorShapeProto(*arg.Shape());
+      break;
+    }
+  }
+  tensors_.insert({arg.Name(), tensor});
+  return tensor;
+}
+
+}  // namespace npu
+
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/vsinpu_ep_graph.h
+++ b/onnxruntime/core/providers/vsinpu/vsinpu_ep_graph.h
@ -0,0 +1,116 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2023 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+
+#pragma once
+#include <map>
+#include <vector>
+#include <string>
+#include <memory>
+#include <unordered_map>
+#include "builders/op_builder.h"
+#include "tim/vx/context.h"
+#include "tim/vx/graph.h"
+#include "tim/vx/tensor.h"
+
+namespace onnxruntime {
+namespace vsi {
+namespace npu {
+// Describes one input or output of the graph handled by a GraphEP.
+struct GraphIOInfo {
+  // Name of the corresponding ONNX tensor; used to match NodeArgs by name.
+  std::string name;
+  // True when the entry refers to a constant initializer rather than runtime data.
+  bool is_initializer;
+  // TIM-VX tensor backing this entry; assigned when the tensor is mapped.
+  std::shared_ptr<tim::vx::Tensor> tensor;
+  // ONNX-side shape of the tensor; assigned together with `tensor`.
+  TensorShape shape;
+};
+
+// Groups a TIM-VX operation with the names of the tensors it reads and writes.
+// NOTE(review): presumably consumed by GraphEP::BindTensors to bind tensors by
+// name -- confirm against the implementation.
+struct NodeIOInfo {
+  // The TIM-VX operation.
+  std::shared_ptr<tim::vx::Operation> op_;
+  // Names of the operation's input tensors.
+  std::vector<std::string> input_names_;
+  // Names of the operation's output tensors.
+  std::vector<std::string> output_names_;
+};
+
+// Holds the TIM-VX context/graph built for one ONNX Runtime graph, together with
+// the name -> tensor map, the created operations and the graph I/O descriptions.
+class GraphEP {
+ public:
+  explicit GraphEP(const GraphViewer& graph_viewer);
+  ~GraphEP() {}
+
+  bool Prepare();
+
+  // Returns whether `node_unit` is an operator supported by VSINPU.
+  static bool SupportedOp(const onnxruntime::GraphViewer& graph_viewer,
+                          const NodeUnit& node_unit);
+
+  // Returns whether `node_unit` is supported by VSINPU within a partition node group.
+  static bool IsNodeSupportedInGroup(const NodeUnit& node_unit, const GraphViewer& graph_viewer);
+
+  // Returns the NodeUnit that `node` belongs to.
+  const NodeUnit& GetNodeUnit(const Node* node) const;
+
+  // Mutable accessors: callers read and modify the returned members in place.
+  bool& GetCompiled() { return compiled_; }
+  std::shared_ptr<tim::vx::Graph>& GetGraph() { return graph_; }
+  std::vector<std::shared_ptr<tim::vx::Operation>>& GetOps() { return ops_; }
+  std::map<std::string, std::shared_ptr<tim::vx::Tensor>>& GetTensors() {
+    return tensors_;
+  }
+
+  std::vector<std::shared_ptr<GraphIOInfo>>& GetGraphInputs() {
+    return graph_inputs_;
+  }
+
+  std::vector<std::shared_ptr<GraphIOInfo>>& GetGraphOutputs() {
+    return graph_outputs_;
+  }
+
+  void UpdateTensorMap(const std::string& name, const std::shared_ptr<tim::vx::Tensor>& dst_tensor);
+
+  std::shared_ptr<NodeIOInfo> ConstructNodeIO(const std::shared_ptr<tim::vx::Operation>& op, std::vector<NodeArg*> input_arg, std::vector<NodeArg*> output_arg);
+
+  // Binds the tensors named in `nodeio_info` to its operation.
+  // Returns false when a named tensor is missing or null.
+  bool BindTensors(const std::shared_ptr<NodeIOInfo>& nodeio_info);
+
+  // Returns the TIM-VX tensor for `nudef.node_arg`, creating and caching it in
+  // the tensor map on first use.
+  std::shared_ptr<tim::vx::Tensor> MapTIMVXTensor(
+      std::shared_ptr<tim::vx::Graph>& graph, const NodeUnitIODef nudef,
+      const NodeUnit& nodeunit,
+      const GraphViewer* graph_viewer, tim::vx::TensorAttribute attribute);
+
+ private:
+  std::shared_ptr<tim::vx::Context> context_;
+  std::shared_ptr<tim::vx::Graph> graph_;
+  // Tensors created so far, keyed by ONNX tensor name.
+  std::map<std::string, std::shared_ptr<tim::vx::Tensor>> tensors_;
+  std::vector<std::shared_ptr<tim::vx::Operation>> ops_;
+  std::vector<std::shared_ptr<GraphIOInfo>> graph_inputs_;
+  std::vector<std::shared_ptr<GraphIOInfo>> graph_outputs_;
+
+  // Contains all quantized operators' input and the NodeUnit(s) using the input
+  // In the form of {input_name, [NodeUnit(s) using the input]}
+  std::unordered_map<std::string, std::vector<const NodeUnit*>> all_quantized_op_inputs_;
+  const GraphViewer& graph_viewer_;
+
+  // Holder for the NodeUnits in the graph; it keeps the NodeUnits valid
+  // throughout the lifetime of this GraphEP
+  std::vector<std::unique_ptr<NodeUnit>> node_unit_holder_;
+  std::unordered_map<const Node*, const NodeUnit*> node_unit_map_;
+  // Result of the TIM-VX graph compilation, written through GetCompiled().
+  bool compiled_;
+};
+}  // namespace npu
+
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.cc
+++ b/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.cc
@ -0,0 +1,277 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2023 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#include <algorithm>
+#include <memory>
+#include <numeric>
+#include <string>
+#include <tuple>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include "core/framework/compute_capability.h"
+#include "core/providers/vsinpu/vsinpu_execution_provider.h"
+#include "core/providers/vsinpu/vsinpu_ep_graph.h"
+#include "core/providers/vsinpu/builders/op_builder.h"
+#include "core/providers/vsinpu/builders/op_builder_factory.h"
+#include "core/providers/vsinpu/vsinpu_util.h"
+#include "core/framework/kernel_registry.h"
+#include "core/framework/node_unit.h"
+#include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h"
+#include "core/optimizer/qdq_transformer/selectors_actions/shared/utils.h"
+#include "core/providers/partitioning_utils.h"
+
+namespace onnxruntime {
+// Constructs the provider, stores the device id from `info` and registers two
+// CPUAllocator-backed allocators under the "VSINPU" memory name.
+VSINPUExecutionProvider::VSINPUExecutionProvider(const VSINPUExecutionProviderInfo& info)
+    : IExecutionProvider{onnxruntime::kVSINPUExecutionProvider},
+      device_id_(info.device_id) {
+  // Factory for the allocator created without an explicit memory type.
+  const auto make_default_allocator = [](int) {
+    return std::make_unique<CPUAllocator>(
+        OrtMemoryInfo("VSINPU", OrtAllocatorType::OrtDeviceAllocator));
+  };
+  // Factory for the allocator tagged with OrtMemTypeCPUOutput.
+  const auto make_cpu_output_allocator = [](int) {
+    return std::make_unique<CPUAllocator>(
+        OrtMemoryInfo("VSINPU", OrtAllocatorType::OrtDeviceAllocator, OrtDevice(), 0, OrtMemTypeCPUOutput));
+  };
+
+  AllocatorCreationInfo default_memory_info{make_default_allocator};
+  CreateAllocator(default_memory_info);
+
+  AllocatorCreationInfo cpu_memory_info{make_cpu_output_allocator};
+  CreateAllocator(cpu_memory_info);
+}
+
+// Nothing to release explicitly; members clean up after themselves.
+VSINPUExecutionProvider::~VSINPUExecutionProvider() = default;
+
+// Partitions `graph_viewer` into groups of nodes that VSINPU can execute.
+//
+// Returns an empty result for subgraphs and for graphs that contain an
+// initializer stored at an external data location. A capability whose inputs are
+// all constant initializers is reset to null; the null entry stays in the
+// returned vector but is not counted in the logged summary.
+std::vector<std::unique_ptr<ComputeCapability>>
+VSINPUExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer,
+                                       const IKernelLookup& /*kernel_lookup*/) const {
+  std::vector<std::unique_ptr<ComputeCapability>> result;
+
+  if (graph_viewer.IsSubgraph()) {
+    return result;
+  }
+
+  for (const auto& tensor : graph_viewer.GetAllInitializedTensors()) {
+    if (tensor.second->has_data_location()) {
+      LOGS_DEFAULT(VERBOSE) << "location:" << tensor.second->data_location();
+      if (tensor.second->data_location() ==
+          ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL) {
+        LOGS_DEFAULT(WARNING) << "VSINPU: Initializers with external data location are not "
+                                 "currently supported";
+        return result;
+      }
+    }
+  }
+  // Get all the NodeUnits in the graph_viewer
+  std::vector<std::unique_ptr<NodeUnit>> node_unit_holder;
+  std::unordered_map<const Node*, const NodeUnit*> node_unit_map;
+  std::tie(node_unit_holder, node_unit_map) = QDQ::GetAllNodeUnits(graph_viewer);
+
+  // This holds the result of whether a NodeUnit is supported or not,
+  // to prevent nodes in a NodeUnit to be checked for multiple times
+  std::unordered_map<const NodeUnit*, bool> node_unit_supported_result;
+  node_unit_supported_result.reserve(node_unit_holder.size());
+  std::unordered_set<std::string> node_outputs_in_current_group{};
+
+  const auto is_node_supported = [&](const Node& node) -> bool {
+    const NodeUnit* node_unit = node_unit_map.at(&node);
+    bool supported = false;
+
+    // If we have visited one of the nodes in the node_unit, use the result directly
+    const auto it = node_unit_supported_result.find(node_unit);
+    if (it != node_unit_supported_result.cend()) {
+      supported = it->second;
+    } else {
+      // We only check the target node of the node unit
+      supported = vsi::npu::GraphEP::IsNodeSupportedInGroup(*node_unit, graph_viewer);
+      node_unit_supported_result[node_unit] = supported;
+    }
+
+    LOGS_DEFAULT(VERBOSE) << "Node supported: [" << supported
+                          << "] Operator type: [" << node.OpType()
+                          << "] index: [" << node.Index()
+                          << "] name: [" << node.Name()
+                          << "] as part of the NodeUnit type: [" << node_unit->OpType()
+                          << "] index: [" << node_unit->Index()
+                          << "] name: [" << node_unit->Name()
+                          << "]";
+
+    if (supported) {
+      // We want to save all the output names of nodes in the current group for easy query
+      for (const auto* output : node.OutputDefs()) {
+        node_outputs_in_current_group.insert(output->Name());
+      }
+    }
+    return supported;
+  };
+
+  const auto on_group_closed = [&](const std::vector<const Node*>& /*group*/) -> bool {
+    // reset per-partition node group tracking
+    node_outputs_in_current_group.clear();
+    return true;
+  };
+
+  const auto gen_metadef_name = [&]() {
+    static size_t group_counter = 0;
+    return "VSINPU_" + std::to_string(++group_counter);
+  };
+  result = utils::CreateSupportedPartitions(graph_viewer, is_node_supported, on_group_closed,
+                                            gen_metadef_name, "VSINPU", kVSINPUExecutionProvider, &node_unit_map);
+  std::for_each(result.begin(), result.end(), [&graph_viewer](auto& capability) {
+    if (capability && capability->sub_graph && capability->sub_graph->GetMetaDef()) {
+      const auto* meta_def = capability->sub_graph->GetMetaDef();
+      bool has_any_non_constant_inputs = std::any_of(meta_def->inputs.begin(), meta_def->inputs.end(), [&graph_viewer](const auto& input) {
+        return !graph_viewer.IsConstantInitializer(input, true);
+      });
+
+      // ALL inputs are constant
+      if (!has_any_non_constant_inputs) {
+        capability.reset();
+      }
+    }
+  });
+
+  // Count only the partitions that survived the constant-input filter above;
+  // reset entries are null and must not inflate the summary or trigger the warning.
+  const auto num_of_partitions = static_cast<size_t>(std::count_if(
+      result.begin(), result.end(),
+      [](const auto& partition) { return partition && partition->sub_graph; }));
+  const auto num_of_supported_nodes = std::accumulate(
+      result.begin(), result.end(), size_t{0},
+      [](const auto& acc, const auto& partition) -> size_t {
+        return acc + (partition && partition->sub_graph ? partition->sub_graph->nodes.size() : 0);
+      });
+
+  const auto summary_msg = MakeString(
+      "VSINPUExecutionProvider::GetCapability,",
+      " number of partitions supported by VSINPU: ", num_of_partitions,
+      "; number of nodes in the graph: ", graph_viewer.NumberOfNodes(),
+      "; number of nodes supported by VSINPU: ", num_of_supported_nodes);
+
+  // If the graph is partitioned in multiple subgraphs, and this may impact performance,
+  // we want to give users a summary message at warning level.
+  if (num_of_partitions > 1) {
+    LOGS_DEFAULT(WARNING) << summary_msg;
+  } else {
+    LOGS_DEFAULT(INFO) << summary_msg;
+  }
+
+  return result;
+}
+
+// Executes the compiled TIM-VX graph held by `graph_ep` for one kernel invocation.
+//
+// Copies every non-initializer graph input from the kernel context into its
+// TIM-VX tensor, runs the graph, then copies each graph output into the output
+// allocated from the kernel context. Returns a failure Status when the graph
+// run fails, so no output is produced from a failed run.
+//
+// NOTE(review): the i-th graph input is read from the i-th kernel-context input;
+// this assumes both lists share the same ordering -- confirm.
+Status ComputeStateFunc(vsi::npu::GraphEP* graph_ep,
+                        OrtKernelContext* context) {
+  Ort::KernelContext ctx(context);
+  const auto& graph_inputs = graph_ep->GetGraphInputs();
+
+  for (size_t i = 0; i < graph_inputs.size(); i++) {
+    // Initializers are not fed at run time.
+    if (graph_inputs[i]->is_initializer) {
+      continue;
+    }
+    const auto onnx_input_tensor = ctx.GetInput(i);
+    const auto tensor_info = onnx_input_tensor.GetTensorTypeAndShapeInfo();
+
+    auto origin_tensor = graph_inputs[i]->tensor;
+    origin_tensor->CopyDataToTensor(onnx_input_tensor.GetTensorRawData(), vsi::npu::util::GetTensorBytes(tensor_info));
+  }
+
+  if (!graph_ep->GetGraph()->Run()) {
+    LOGS_DEFAULT(ERROR) << "Failed to run graph.";
+    // Report the failure instead of returning OK with unwritten outputs.
+    return Status(common::ONNXRUNTIME, common::FAIL, "VSINPU: failed to run graph.");
+  }
+
+  const auto& graph_outputs = graph_ep->GetGraphOutputs();
+  for (size_t i = 0; i < ctx.GetOutputCount(); i++) {
+    auto timvx_tensor = graph_outputs[i]->tensor;
+    auto out_shape = graph_outputs[i]->shape.GetDims();
+    auto onnx_output_tensor =
+        ctx.GetOutput(i, out_shape.data(), out_shape.size());
+    // The output buffer is written to, so request it through the mutable accessor.
+    timvx_tensor->CopyDataFromTensor(onnx_output_tensor.GetTensorMutableRawData());
+  }
+
+  return Status::OK();
+}
+
+// Builds and compiles one TIM-VX graph per fused node and registers the matching
+// compute callbacks in `node_compute_funcs`.
+//
+// For each fused graph: records its inputs (including initializers) and outputs,
+// adds one TIM-VX op per NodeUnit in topological order, then compiles the graph.
+// Returns a failure Status as soon as a graph fails to compile, because such a
+// graph cannot be executed later.
+Status VSINPUExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fused_nodes_and_graphs,
+                                        std::vector<NodeComputeInfo>& node_compute_funcs) {
+  for (const auto& fused_node_graph : fused_nodes_and_graphs) {
+    const GraphViewer& graph_viewer = fused_node_graph.filtered_graph;
+    std::shared_ptr<vsi::npu::GraphEP> graph_ep = std::make_shared<vsi::npu::GraphEP>(graph_viewer);
+
+    for (auto tensor : graph_viewer.GetInputsIncludingInitializers()) {
+      LOGS_DEFAULT(VERBOSE) << "subgraph input init:" << vsi::npu::util::PrintNode(*tensor) << "#"
+                            << graph_viewer.IsInitializedTensor(tensor->Name());
+      auto input = std::make_shared<vsi::npu::GraphIOInfo>();
+      input->name = tensor->Name();
+      input->is_initializer = graph_viewer.IsConstantInitializer(tensor->Name(), true);
+      graph_ep->GetGraphInputs().push_back(input);
+    }
+    for (auto tensor : graph_viewer.GetOutputs()) {
+      LOGS_DEFAULT(VERBOSE) << "subgraph output:" << vsi::npu::util::PrintNode(*tensor);
+      auto output = std::make_shared<vsi::npu::GraphIOInfo>();
+      output->name = tensor->Name();
+      output->is_initializer = false;
+      graph_ep->GetGraphOutputs().push_back(output);
+    }
+
+    auto node_indices = graph_viewer.GetNodesInTopologicalOrder();
+    for (const auto& node_index : node_indices) {
+      const auto node = graph_viewer.GetNode(node_index);
+      const NodeUnit& node_unit = graph_ep->GetNodeUnit(node);
+
+      // Only add op when we hit the target node
+      if (node != &node_unit.GetNode()) {
+        continue;
+      }
+      LOGS_DEFAULT(VERBOSE) << "Adding node: [" << node->OpType() << "]";
+      vsi::npu::SupportedBuiltinOps().at(node->OpType())->BuildOp(graph_ep.get(), graph_viewer, node_unit);
+    }
+
+    LOGS_DEFAULT(INFO) << "Verifying graph";
+    graph_ep->GetCompiled() = graph_ep->GetGraph()->Compile();
+    if (!graph_ep->GetCompiled()) {
+      LOGS_DEFAULT(ERROR) << "Failed to verify graph.";
+      // Do not register a compute function for a graph that failed to compile.
+      return Status(common::ONNXRUNTIME, common::FAIL, "VSINPU: failed to verify graph.");
+    }
+    LOGS_DEFAULT(INFO) << "Graph has been verified successfully.";
+
+    NodeComputeInfo compute_info;
+    // The state handed to ORT is the raw GraphEP pointer; the lambdas below keep
+    // the shared_ptr alive by capturing it.
+    compute_info.create_state_func = [graph_ep](ComputeContext* /*context*/,
+                                                FunctionState* state) {
+      *state = graph_ep.get();
+      return 0;
+    };
+
+    compute_info.compute_func =
+        [graph_ep, this](FunctionState /*state*/, const OrtApi* /* api */,
+                         OrtKernelContext* context) {
+          // Graph executions are serialized on the provider-wide mutex.
+          std::lock_guard<OrtMutex> lock(this->GetMutex());
+          Status res = ComputeStateFunc(graph_ep.get(), context);
+          return res;
+        };
+
+    // Nothing to free: the GraphEP is owned by the captured shared_ptr.
+    compute_info.release_state_func = [](FunctionState /*state*/) {};
+
+    node_compute_funcs.push_back(compute_info);
+  }
+
+  return Status::OK();
+}
+
+// Returns the kernel registry shared by all VSINPU providers. It is created once
+// on first call and no kernels are registered into it here.
+std::shared_ptr<KernelRegistry> VSINPUExecutionProvider::GetKernelRegistry() const {
+  static auto shared_registry = std::make_shared<KernelRegistry>();
+  return shared_registry;
+}
+
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.h
+++ b/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.h
@ -0,0 +1,53 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2023 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#pragma once
+#include <memory>
+#include <vector>
+#include "core/framework/execution_provider.h"
+#include "core/session/abi_session_options_impl.h"
+
+namespace onnxruntime {
+// Construction options for VSINPUExecutionProvider.
+struct VSINPUExecutionProviderInfo {
+  // Device identifier copied into the provider on construction; defaults to 0.
+  int device_id{0};
+};
+
+// Execution provider that runs supported graph partitions through TIM-VX.
+class VSINPUExecutionProvider : public IExecutionProvider {
+ public:
+  explicit VSINPUExecutionProvider(const VSINPUExecutionProviderInfo& info);
+  virtual ~VSINPUExecutionProvider();
+
+  // Returns the partitions of `graph_viewer` that this provider can execute.
+  std::vector<std::unique_ptr<ComputeCapability>> GetCapability(
+      const onnxruntime::GraphViewer& graph_viewer,
+      const IKernelLookup& kernel_lookup) const override;
+  std::shared_ptr<KernelRegistry> GetKernelRegistry() const override;
+  // Builds a TIM-VX graph for every fused node and appends its compute callbacks
+  // to `node_compute_funcs`.
+  Status Compile(const std::vector<FusedNodeAndGraph>& fused_nodes_and_graphs,
+                 std::vector<NodeComputeInfo>& node_compute_funcs) override;
+  // Mutex locked by the compute callbacks around each graph execution.
+  OrtMutex& GetMutex() { return mutex_; }
+
+ private:
+  // Device id taken from VSINPUExecutionProviderInfo at construction.
+  int device_id_;
+  OrtMutex mutex_;
+};
+
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/vsinpu_provider_factory.cc
+++ b/onnxruntime/core/providers/vsinpu/vsinpu_provider_factory.cc
@ -0,0 +1,59 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2023 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#include "core/framework/compute_capability.h"
+#include "core/providers/vsinpu/vsinpu_provider_factory.h"
+#include "core/providers/vsinpu/vsinpu_provider_factory_creator.h"
+#include "core/providers/vsinpu/vsinpu_execution_provider.h"
+
+namespace onnxruntime {
+
+// Factory producing VSINPUExecutionProvider instances.
+struct VSINPUProviderFactory : IExecutionProviderFactory {
+  VSINPUProviderFactory() = default;
+  ~VSINPUProviderFactory() override = default;
+
+  // Creates a provider configured with default options.
+  std::unique_ptr<IExecutionProvider> CreateProvider() override;
+};
+
+// Creates a VSINPU execution provider from a default-initialized info struct.
+std::unique_ptr<IExecutionProvider> VSINPUProviderFactory::CreateProvider() {
+  return std::make_unique<VSINPUExecutionProvider>(VSINPUExecutionProviderInfo{});
+}
+
+// Returns a new VSINPU provider factory.
+std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_VSINPU() {
+  auto factory = std::make_shared<VSINPUProviderFactory>();
+  return factory;
+}
+
+// Returns a new VSINPU provider factory.
+std::shared_ptr<IExecutionProviderFactory>
+VSINPUProviderFactoryCreator::Create() {
+  auto factory = std::make_shared<VSINPUProviderFactory>();
+  return factory;
+}
+
+}  // namespace onnxruntime
+
+// C API entry point: appends a VSINPU provider factory to `options`.
+// Returns nullptr; `options` is dereferenced without a null check.
+ORT_API_STATUS_IMPL(OrtSessionOptionsAppendExecutionProvider_VSINPU,
+                    _In_ OrtSessionOptions* options) {
+  options->provider_factories.push_back(
+      onnxruntime::VSINPUProviderFactoryCreator::Create());
+  return nullptr;
+}
--- a/onnxruntime/core/providers/vsinpu/vsinpu_provider_factory_creator.h
+++ b/onnxruntime/core/providers/vsinpu/vsinpu_provider_factory_creator.h
@ -0,0 +1,34 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2023 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+#pragma once
+
+#include <memory>
+
+#include "core/providers/providers.h"
+
+namespace onnxruntime {
+// Static entry point for obtaining a VSINPU execution provider factory.
+struct VSINPUProviderFactoryCreator {
+  // Returns a newly created factory for the VSINPU execution provider.
+  static std::shared_ptr<IExecutionProviderFactory> Create();
+};
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/vsinpu_util.cc
+++ b/onnxruntime/core/providers/vsinpu/vsinpu_util.cc
@ -0,0 +1,502 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2023 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+
+#include <map>
+#include <algorithm>
+#include <utility>
+#include <unordered_set>
+#include "core/providers/vsinpu/vsinpu_util.h"
+
+#include "core/optimizer/initializer.h"
+#include "core/providers/shared/utils/utils.h"
+namespace onnxruntime {
+
+// Deleter functor for buffers allocated with new[]: releases them with
+// delete[]. UnpackTensor in this file passes it to std::shared_ptr so the
+// array form of delete is used for the unpacked byte buffer.
+template <typename T>
+struct shared_array_deletor {
+  void operator()(T const* ptr) { delete[] ptr; }
+};
+namespace vsi {
+namespace npu {
+namespace util {
+// Translates an ONNX TensorProto element-type value into the corresponding
+// TIM-VX data type. A value with no entry in the table is reported with a
+// warning and mapped to FLOAT32.
+tim::vx::DataType OnnxDtypeToTIMVXDtype(const int32_t dtype) {
+  static const std::map<int32_t, tim::vx::DataType> kDtypeTable = {
+      {onnx::TensorProto_DataType_FLOAT, tim::vx::DataType::FLOAT32},
+      {onnx::TensorProto_DataType_FLOAT16, tim::vx::DataType::FLOAT16},
+      {onnx::TensorProto_DataType_INT8, tim::vx::DataType::INT8},
+      {onnx::TensorProto_DataType_UINT8, tim::vx::DataType::UINT8},
+      {onnx::TensorProto_DataType_INT32, tim::vx::DataType::INT32},
+      {onnx::TensorProto_DataType_INT16, tim::vx::DataType::INT16},
+      {onnx::TensorProto_DataType_UINT16, tim::vx::DataType::UINT16},
+      {onnx::TensorProto_DataType_BOOL, tim::vx::DataType::BOOL8},
+  };
+
+  const auto entry = kDtypeTable.find(dtype);
+  if (entry == kDtypeTable.end()) {
+    LOGS_DEFAULT(WARNING) << "Unsupported data type: " << dtype;
+    return tim::vx::DataType::FLOAT32;
+  }
+  return entry->second;
+}
+
+// Translates an ONNX type string (for example "tensor(float)") into the
+// corresponding TIM-VX data type.
+//
+// `type` is dereferenced to obtain the string; a null pointer is treated like
+// any other unsupported type instead of being dereferenced. Unsupported types
+// are reported with a warning and mapped to FLOAT32.
+tim::vx::DataType OnnxDtypeToTIMVXDtype(const ONNX_NAMESPACE::DataType type) {
+  static const std::map<std::string, tim::vx::DataType> type_table = {
+      {"tensor(float)", tim::vx::DataType::FLOAT32},
+      {"tensor(float16)", tim::vx::DataType::FLOAT16},
+      {"tensor(int8)", tim::vx::DataType::INT8},
+      {"tensor(uint8)", tim::vx::DataType::UINT8},
+      {"tensor(int32)", tim::vx::DataType::INT32},
+      {"tensor(int16)", tim::vx::DataType::INT16},
+      {"tensor(uint16)", tim::vx::DataType::UINT16},
+      {"tensor(int64)", tim::vx::DataType::INT64},
+      {"tensor(bool)", tim::vx::DataType::BOOL8},
+  };
+
+  // Guard against a missing type before dereferencing it.
+  if (type == nullptr) {
+    LOGS_DEFAULT(WARNING) << "Unsupported data type: <null>";
+    return tim::vx::DataType::FLOAT32;
+  }
+
+  auto search = type_table.find(*type);
+  if (search != type_table.end()) {
+    return search->second;
+  }
+  LOGS_DEFAULT(WARNING) << "Unsupported data type: " << *type;
+  return tim::vx::DataType::FLOAT32;
+}
+
+// Copies the dimensions of an ORT TensorShape into a TIM-VX shape, keeping the
+// dimension order. A rank-0 (scalar) shape becomes the one-element shape {1}.
+tim::vx::ShapeType OnnxShapeToTIMVXShape(const onnxruntime::TensorShape& ts) {
+  const size_t rank = ts.NumDimensions();
+  if (rank == 0) {
+    tim::vx::ShapeType scalar_shape;
+    scalar_shape.push_back(1);
+    return scalar_shape;
+  }
+
+  tim::vx::ShapeType converted(rank);
+  for (size_t idx = 0; idx < rank; ++idx) {
+    converted[idx] = ts.GetDims()[idx];
+  }
+  return converted;
+}
+
+// Formats a NodeArg as "<name>:<d0,d1,...>" for logging.
+// Returns "<null>" when the NodeArg has no shape, and
+// "<name>:<1>, is a scalar" when the shape has no dimensions.
+std::string PrintNode(const onnxruntime::NodeArg& node_arg) {
+  const auto* shape_proto = node_arg.Shape();
+  if (shape_proto == nullptr) {
+    return "<null>";
+  }
+
+  std::string text = node_arg.Name() + ":<";
+  const int rank = shape_proto->dim_size();
+  if (rank == 0) {
+    text += "1>, is a scalar";
+    return text;
+  }
+
+  for (int idx = 0; idx < rank; ++idx) {
+    text += std::to_string(shape_proto->dim(idx).dim_value());
+    // Separate dimensions with commas and close the bracket after the last one.
+    text += (idx < rank - 1) ? "," : ">";
+  }
+  return text;
+}
+
+// Formats a list of dimensions as "<d0,d1,...>" for logging.
+// An empty list is rendered as "<null>".
+std::string PrintNode(const std::vector<int64_t> shape) {
+  if (shape.empty()) {
+    return "<null>";
+  }
+
+  std::string text = "<";
+  const std::size_t last = shape.size() - 1;
+  for (std::size_t idx = 0; idx <= last; ++idx) {
+    text += std::to_string(shape[idx]);
+    // Separate dimensions with commas and close the bracket after the last one.
+    text += (idx < last) ? "," : ">";
+  }
+  return text;
+}
+
+// Returns the size in bytes of one element of the given type, or 0 for any
+// type that is not listed below.
+size_t GetTensorElementSize(const ONNXTensorElementDataType type) {
+  size_t element_bytes = 0;
+  switch (type) {
+    case onnx::TensorProto_DataType_BOOL:
+    case onnx::TensorProto_DataType_INT8:
+    case onnx::TensorProto_DataType_UINT8:
+      element_bytes = 1;
+      break;
+    case onnx::TensorProto_DataType_FLOAT16:
+    case onnx::TensorProto_DataType_INT16:
+    case onnx::TensorProto_DataType_UINT16:
+      element_bytes = 2;
+      break;
+    case onnx::TensorProto_DataType_FLOAT:
+    case onnx::TensorProto_DataType_INT32:
+      element_bytes = 4;
+      break;
+    case onnx::TensorProto_DataType_INT64:
+      element_bytes = 8;
+      break;
+    default:
+      break;
+  }
+  return element_bytes;
+}
+
+// Returns the total byte size described by `info`: element count multiplied by
+// the per-element size reported by GetTensorElementSize (0 for unlisted types).
+size_t GetTensorBytes(const Ort::TensorTypeAndShapeInfo& info) {
+  const size_t element_count = info.GetElementCount();
+  const size_t element_bytes = GetTensorElementSize(info.GetElementType());
+  return element_count * element_bytes;
+}
+
+// Builds a TensorShape from the shape proto of a NodeArg, using dim_value()
+// of every dimension. When the NodeArg has no shape, or the shape has no
+// dimensions, the result is the one-element shape {1}.
+TensorShape GetTensorShape(const onnxruntime::NodeArg& node_arg) {
+  std::vector<int64_t> dims;
+  if (const auto* shape_proto = node_arg.Shape()) {
+    const int rank = shape_proto->dim_size();
+    for (int idx = 0; idx < rank; ++idx) {
+      dims.push_back(shape_proto->dim(idx).dim_value());
+    }
+  }
+
+  if (dims.empty()) {
+    dims.push_back(1);
+  }
+  return TensorShape(dims);
+}
+
+// Unpacks the data of `initializer` into a newly allocated byte buffer.
+//
+// The element count is taken from the shape of `node_arg` (via GetTensorShape),
+// and the buffer size is that count times sizeof the C++ type matching the
+// initializer's data type. The buffer is released with delete[] through
+// shared_array_deletor.
+//
+// Returns nullptr when the initializer's data type is not one of the cases
+// listed in the switch below.
+// NOTE(review): when onnxruntime::utils::UnpackTensor fails, an error is
+// logged but the allocated buffer is still returned; its contents are not set
+// by this function in that case -- confirm callers tolerate this.
+std::shared_ptr<uint8_t> UnpackTensor(
+    const NodeArg* node_arg, const ONNX_NAMESPACE::TensorProto& initializer) {
+  std::shared_ptr<uint8_t> unpackedTensor;
+  auto shape = GetTensorShape(*node_arg);
+  size_t elementCount = shape.Size();
+
+  // CASE_PROTO(X, Y): case label for TensorProto data type X that allocates
+  // elementCount * sizeof(Y) bytes and unpacks the initializer into them as Y.
+  // Raw data is passed when the initializer has it, otherwise nullptr / 0.
+#define CASE_PROTO(X, Y)                                                      \
+  case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_##X: {      \
+    size_t tensorByteSize = elementCount * sizeof(Y);                         \
+    unpackedTensor.reset(new uint8_t[tensorByteSize],                         \
+                         shared_array_deletor<uint8_t>());                    \
+    auto status = onnxruntime::utils::UnpackTensor(                           \
+        initializer,                                                          \
+        initializer.has_raw_data() ? initializer.raw_data().data() : nullptr, \
+        initializer.has_raw_data() ? initializer.raw_data().size() : 0,       \
+        reinterpret_cast<Y*>(unpackedTensor.get()), elementCount);            \
+    if (!status.IsOK()) {                                                     \
+      LOGS_DEFAULT(ERROR) << "Unpack tensor data failed.";                    \
+    }                                                                         \
+    break;                                                                    \
+  }
+  // Dispatch on the initializer's element type; any type not listed is rejected.
+  switch (initializer.data_type()) {
+    CASE_PROTO(FLOAT, float);
+    CASE_PROTO(DOUBLE, double);
+    CASE_PROTO(BOOL, bool);
+    CASE_PROTO(INT8, int8_t);
+    CASE_PROTO(INT16, int16_t);
+    CASE_PROTO(INT32, int32_t);
+    CASE_PROTO(INT64, int64_t);
+    CASE_PROTO(UINT8, uint8_t);
+    CASE_PROTO(UINT16, uint16_t);
+    CASE_PROTO(UINT32, uint32_t);
+    CASE_PROTO(FLOAT16, onnxruntime::MLFloat16);
+    default:
+      return nullptr;
+  }
+
+  return unpackedTensor;
+}
+
+// Maps an ONNX auto_pad string to a TIM-VX pad type:
+//   "NOTSET" -> AUTO, "SAME_UPPER" / "SAME_LOWER" -> SAME, "VALID" -> VALID.
+// Any other string yields PadType::NONE.
+tim::vx::PadType GetPadType(const std::string type) {
+  if (type == "NOTSET") {
+    return tim::vx::PadType::AUTO;
+  }
+  if (type == "SAME_UPPER" || type == "SAME_LOWER") {
+    return tim::vx::PadType::SAME;
+  }
+  if (type == "VALID") {
+    return tim::vx::PadType::VALID;
+  }
+  return tim::vx::PadType::NONE;
+}
+
+// Converts an axis index into the index counted from the opposite end of a
+// shape with `length` dimensions. A negative `origin_axis` is first wrapped by
+// adding `length`. Example: length 4, axis 1 -> 2; axis -1 -> 0.
+int32_t ReverseAxis(int32_t origin_axis, int32_t length) {
+  const int32_t wrapped_axis = (origin_axis < 0) ? origin_axis + length : origin_axis;
+  return length - wrapped_axis - 1;
+}
+
+// Applies the single-axis reversal (length - axis - 1, after wrapping negative
+// axes by `length`) to every entry of `origin_axes` and returns the results
+// sorted in ascending order.
+std::vector<int32_t> ReverseAxis(std::vector<int32_t> origin_axes, int32_t length) {
+  std::vector<int32_t> reversed;
+  for (const int32_t origin : origin_axes) {
+    const int32_t wrapped = (origin < 0) ? origin + length : origin;
+    reversed.push_back(length - wrapped - 1);
+  }
+  std::sort(reversed.begin(), reversed.end());
+  return reversed;
+}
+
+// Returns true when the NodeArg carries type information and its tensor
+// element type is one of BOOL, FLOAT, FLOAT16, INT8, UINT8, INT32 or INT64.
+bool IsTypeSupported(const NodeArg* node_arg) {
+  static const std::unordered_set<int32_t> kSupportedElemTypes = {
+      ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_BOOL,
+      ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT,
+      ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT16,
+      ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT8,
+      ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT8,
+      ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT32,
+      ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT64,
+  };
+
+  const auto* type_proto = node_arg->TypeAsProto();
+  if (type_proto == nullptr) {
+    return false;
+  }
+  return kSupportedElemTypes.count(type_proto->tensor_type().elem_type()) > 0;
+}
+
+// Classifies a NodeUnit as one of the known quantized operator kinds.
+// Single-node units are matched against the QLinear*/(De)QuantizeLinear op
+// names; QDQ-group units are matched against the op type of the wrapped node.
+// Anything else yields QuantizedOpType::Unknown.
+QuantizedOpType GetQuantizedOpType(const NodeUnit& node_unit) {
+  using OpTable = std::map<std::string, QuantizedOpType>;
+  static const OpTable kSingleNodeOps = {
+      {"DequantizeLinear", QuantizedOpType::DequantizeLinear},
+      {"QuantizeLinear", QuantizedOpType::QuantizeLinear},
+      {"QLinearConv", QuantizedOpType::QLinearConv},
+      {"QLinearMatMul", QuantizedOpType::QLinearMatMul},
+      {"QLinearAdd", QuantizedOpType::QLinearAdd},
+      {"QLinearMul", QuantizedOpType::QLinearMul},
+      {"QLinearSigmoid", QuantizedOpType::QLinearSigmoid},
+      {"QLinearAveragePool", QuantizedOpType::QLinearAveragePool},
+  };
+  static const OpTable kQDQGroupOps = {
+      {"Conv", QuantizedOpType::QDQConv},
+      {"Resize", QuantizedOpType::QDQResize},
+      {"AveragePool", QuantizedOpType::QDQAveragePool},
+      {"Add", QuantizedOpType::QDQAdd},
+      {"Mul", QuantizedOpType::QDQMul},
+      {"Transpose", QuantizedOpType::QDQTranspose},
+      {"Reshape", QuantizedOpType::QDQReshape},
+      {"Softmax", QuantizedOpType::QDQSoftmax},
+      {"Concat", QuantizedOpType::QDQConcat},
+      {"Gemm", QuantizedOpType::QDQGemm},
+      {"MatMul", QuantizedOpType::QDQMatMul},
+  };
+
+  const auto& op_type = node_unit.OpType();
+
+  // Pick the table that belongs to this kind of node unit, if any.
+  const OpTable* table = nullptr;
+  if (node_unit.UnitType() == NodeUnit::Type::SingleNode) {
+    table = &kSingleNodeOps;
+  } else if (node_unit.UnitType() == NodeUnit::Type::QDQGroup) {
+    table = &kQDQGroupOps;
+  }
+
+  if (table != nullptr) {
+    const auto match = table->find(op_type);
+    if (match != table->end()) {
+      return match->second;
+    }
+  }
+  return QuantizedOpType::Unknown;
+}
+
+// Decides which convolution flavour a Conv node unit maps to.
+//
+// ONNX has a single Conv op; this provider distinguishes three:
+//   group == 1                          -> Regular
+//   group != 1 and weight dim 1 == group -> Depthwise
+//   group != 1 otherwise                -> Grouped
+// The weight is looked up in `initializers` by the name of input 1;
+// `initializers.at` throws when that input is not an initializer.
+// NOTE(review): the depthwise test compares weight dims()[1] with `group`;
+// confirm this matches the intended weight layout.
+ConvType GetConvType(const NodeUnit& node_unit, const InitializedTensorSet& initializers) {
+  NodeAttrHelper attrs(node_unit);
+  const auto group = attrs.Get("group", 1);
+
+  const auto& weight_name = node_unit.Inputs()[1].node_arg.Name();
+  const auto& weight_tensor = *initializers.at(weight_name);
+
+  if (group == 1) {
+    return ConvType::Regular;
+  }
+
+  const bool is_depthwise = (weight_tensor.dims()[1] == group);
+  return is_depthwise ? ConvType::Depthwise : ConvType::Grouped;
+}
+
+// True for the two quantized convolution kinds: QLinearConv and QDQConv.
+bool IsQuantizedConv(QuantizedOpType quant_op_type) {
+  switch (quant_op_type) {
+    case QuantizedOpType::QLinearConv:
+    case QuantizedOpType::QDQConv:
+      return true;
+    default:
+      return false;
+  }
+}
+
+// True for the two quantized pooling kinds: QLinearAveragePool and
+// QDQAveragePool.
+bool IsQuantizedPool(QuantizedOpType quant_op_type) {
+  switch (quant_op_type) {
+    case QuantizedOpType::QLinearAveragePool:
+    case QuantizedOpType::QDQAveragePool:
+      return true;
+    default:
+      return false;
+  }
+}
+
+// True for the quantized matrix-multiply kinds: QLinearMatMul, QDQGemm and
+// QDQMatMul.
+bool IsQuantizedGemm(QuantizedOpType quant_op_type) {
+  switch (quant_op_type) {
+    case QuantizedOpType::QLinearMatMul:
+    case QuantizedOpType::QDQGemm:
+    case QuantizedOpType::QDQMatMul:
+      return true;
+    default:
+      return false;
+  }
+}
+
+// True for quantized kinds that take two inputs: the MatMul/Gemm kinds, the
+// Add/Mul kinds (QLinear and QDQ), and every quantized convolution.
+bool IsQuantizedBinaryOp(QuantizedOpType quant_op_type) {
+  switch (quant_op_type) {
+    case QuantizedOpType::QLinearMatMul:
+    case QuantizedOpType::QLinearAdd:
+    case QuantizedOpType::QLinearMul:
+    case QuantizedOpType::QDQAdd:
+    case QuantizedOpType::QDQMul:
+    case QuantizedOpType::QDQGemm:
+    case QuantizedOpType::QDQMatMul:
+      return true;
+    default:
+      // Convolutions are the remaining two-input kinds.
+      return IsQuantizedConv(quant_op_type);
+  }
+}
+
+// Checks that the first two inputs of a quantized binary node unit have an
+// accepted element-type combination.
+//
+// Returns false when the node unit is not a quantized binary op, when either
+// input has no element type, or when the combination below is violated:
+//   - Conv / MatMul / Gemm kinds: input B (the weight) must be uint8 or int8.
+//   - All other binary kinds (Add / Mul): inputs A and B must have the same
+//     element type.
+bool HasValidBinaryOpQuantizedInputTypes(const NodeUnit& node_unit) {
+  const auto quant_op_type = GetQuantizedOpType(node_unit);
+  if (!IsQuantizedBinaryOp(quant_op_type)) {
+    LOGS_DEFAULT(VERBOSE) << "[" << node_unit.OpType() << "] is not a binary qlinear op";
+    return false;
+  }
+
+  const auto& inputs = node_unit.Inputs();
+  int32_t a_input_type = ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED;
+  int32_t b_input_type = ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED;
+  if (!GetType(inputs[0].node_arg, a_input_type))
+    return false;
+  if (!GetType(inputs[1].node_arg, b_input_type))
+    return false;
+
+  const bool is_quant_conv_or_gemm = IsQuantizedConv(quant_op_type) || IsQuantizedGemm(quant_op_type);
+
+  const bool has_valid_qlinear_conv_weight =
+      (b_input_type == ONNX_NAMESPACE::TensorProto_DataType_UINT8 ||
+       b_input_type == ONNX_NAMESPACE::TensorProto_DataType_INT8);
+
+  if ((is_quant_conv_or_gemm && !has_valid_qlinear_conv_weight) ||
+      (!is_quant_conv_or_gemm && a_input_type != b_input_type)) {
+    LOGS_DEFAULT(VERBOSE) << "[" << node_unit.OpType()
+                          << "] A Input type: [" << a_input_type
+                          << "] B Input type: [" << b_input_type
+                          << "] is not supported for now";
+    return false;
+  }
+
+  return true;
+}
+
+// Reads the quantization parameters referenced by `io_def.quant_param` from
+// `initializers`.
+//
+// Outputs:
+//   scale      - first element of the scale initializer.
+//   zero_point - first element of the zero-point initializer widened to
+//                int32, or 0 when the quant param has no zero point.
+//   pcq_scales - set to all scales when the scale tensor is per-channel
+//                (first dimension neither 0 nor 1); otherwise left untouched.
+//   pcq_zps    - set to all zero points (widened to int32) when the zero-point
+//                tensor is per-channel; otherwise left untouched.
+//
+// Zero points are read as int8 for INT8 tensors, as int32 for INT32 tensors
+// (some QDQ conv biases are int32 quantized) and byte-wise for everything else.
+// `initializers.at` throws when a referenced initializer is missing.
+void GetQuantizationScaleAndZeroPoint(
+    const InitializedTensorSet& initializers, const NodeUnitIODef& io_def, const Path& model_path,
+    float& scale, int32_t& zero_point, std::optional<std::vector<float>>& pcq_scales,
+    std::optional<std::vector<int32_t>>& pcq_zps) {
+  scale = 0.0f;
+  zero_point = 0;
+
+  const auto& quant_param = *io_def.quant_param;
+  {  // get the scale
+    const auto& name = quant_param.scale.Name();
+    Initializer unpacked_tensor(*initializers.at(name), model_path);
+    scale = unpacked_tensor.DataAsSpan<float>()[0];
+
+    // per channel quantized handling
+    if (!unpacked_tensor.dims().empty() && unpacked_tensor.dims()[0] != 0 && unpacked_tensor.dims()[0] != 1) {
+      auto scales = unpacked_tensor.DataAsSpan<float>();
+      std::vector<float> scales_vec(scales.begin(), scales.end());
+      pcq_scales = onnxruntime::make_optional(std::move(scales_vec));
+    }
+  }
+
+  if (quant_param.zero_point) {  // get the zero point if it exists
+    const auto& name = quant_param.zero_point->Name();
+    Initializer unpacked_tensor(*initializers.at(name), model_path);
+    const bool is_i8_zp = unpacked_tensor.data_type() == onnx::TensorProto_DataType_INT8;
+    // some qdq conv bias is int32 quantized
+    const bool is_int32_zp = unpacked_tensor.data_type() == onnx::TensorProto_DataType_INT32;
+
+    if (is_i8_zp) {
+      zero_point = static_cast<int32_t>(unpacked_tensor.DataAsSpan<int8_t>()[0]);
+    } else if (is_int32_zp) {
+      zero_point = static_cast<int32_t>(unpacked_tensor.DataAsSpan<int32_t>()[0]);
+    } else {
+      zero_point = static_cast<int32_t>(unpacked_tensor.DataAsByteSpan()[0]);
+    }
+
+    // per channel quantized handling
+    if (!unpacked_tensor.dims().empty() && unpacked_tensor.dims()[0] != 0 && unpacked_tensor.dims()[0] != 1) {
+      // Each branch must use the same accessor as the scalar read above:
+      // reading an int32 tensor byte-wise would yield four entries per zero
+      // point, and reading a byte tensor as int32 does not match its type.
+      if (is_i8_zp) {
+        auto zps = unpacked_tensor.DataAsSpan<int8_t>();
+        std::vector<int32_t> zps_vec(zps.begin(), zps.end());
+        pcq_zps = onnxruntime::make_optional(std::move(zps_vec));
+      } else if (is_int32_zp) {
+        auto zps = unpacked_tensor.DataAsSpan<int32_t>();
+        std::vector<int32_t> zps_vec(zps.begin(), zps.end());
+        pcq_zps = onnxruntime::make_optional(std::move(zps_vec));
+      } else {
+        auto zps = unpacked_tensor.DataAsByteSpan();
+        std::vector<int32_t> zps_vec(zps.begin(), zps.end());
+        pcq_zps = onnxruntime::make_optional(std::move(zps_vec));
+      }
+    }
+  }
+}
+
+// Returns true for a non-QDQ node unit whose op is one of Transpose, Resize,
+// Concat or MaxPool and whose first input has element type UINT8 or INT8.
+// These operators can consume 8-bit input without a dedicated QLinear variant.
+// Enforces (ORT_ENFORCE) that the first input's type can be determined.
+static bool IsInternalQuantizedNodeUnit(const NodeUnit& node_unit) {
+  static const std::unordered_set<std::string> internal_quantized_op_types = {
+      "Transpose",
+      "Resize",
+      "Concat",
+      "MaxPool",
+  };
+
+  // A QDQ group is never an internal quantized node.
+  const bool is_qdq_group = (node_unit.UnitType() == NodeUnit::Type::QDQGroup);
+  if (is_qdq_group) {
+    return false;
+  }
+
+  const auto& node = node_unit.GetNode();
+  if (!Contains(internal_quantized_op_types, node.OpType())) {
+    return false;
+  }
+
+  int32_t input_type;
+  ORT_ENFORCE(GetType(*node.InputDefs()[0], input_type));
+
+  switch (input_type) {
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT8:
+    case ONNX_NAMESPACE::TensorProto_DataType_INT8:
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Stores the tensor element type of `node_arg` in `type` and returns true.
+// When the NodeArg has no type proto, no tensor type, or no element type,
+// `type` is set to TensorProto_DataType_UNDEFINED, a warning is logged and
+// false is returned.
+bool GetType(const NodeArg& node_arg, int32_t& type) {
+  type = ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED;
+
+  const auto* type_proto = node_arg.TypeAsProto();
+  const bool has_elem_type = type_proto != nullptr &&
+                             type_proto->has_tensor_type() &&
+                             type_proto->tensor_type().has_elem_type();
+  if (has_elem_type) {
+    type = type_proto->tensor_type().elem_type();
+    return true;
+  }
+
+  LOGS_DEFAULT(WARNING) << "NodeArg [" << node_arg.Name() << "] has no input type";
+  return false;
+}
+}  // namespace util
+}  // namespace npu
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/vsinpu/vsinpu_util.h
+++ b/onnxruntime/core/providers/vsinpu/vsinpu_util.h
@ -0,0 +1,131 @@
+/****************************************************************************
+ *
+ *    Copyright (c) 2023 Vivante Corporation
+ *
+ *    Permission is hereby granted, free of charge, to any person obtaining a
+ *    copy of this software and associated documentation files (the "Software"),
+ *    to deal in the Software without restriction, including without limitation
+ *    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *    and/or sell copies of the Software, and to permit persons to whom the
+ *    Software is furnished to do so, subject to the following conditions:
+ *
+ *    The above copyright notice and this permission notice shall be included in
+ *    all copies or substantial portions of the Software.
+ *
+ *    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *    DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+
+#pragma once
+#include <memory>
+#include <string>
+#include <vector>
+#include "core/framework/op_kernel.h"
+#include "core/framework/tensor_type_and_shape.h"
+#include "core/framework/tensorprotoutils.h"
+#include "core/session/onnxruntime_cxx_api.h"
+#include "core/framework/node_unit.h"
+#include "tim/vx/tensor.h"
+#include "tim/vx/types.h"
+
+namespace onnxruntime {
+namespace vsi {
+namespace npu {
+namespace util {
+
+// Maps an ONNX TensorProto element-type value to a TIM-VX data type.
+// Unmapped values log a warning and yield FLOAT32.
+tim::vx::DataType OnnxDtypeToTIMVXDtype(const int32_t dtype);
+
+// Same mapping keyed by the ONNX type string, e.g. "tensor(float)".
+// Unmapped strings log a warning and yield FLOAT32.
+tim::vx::DataType OnnxDtypeToTIMVXDtype(const ONNX_NAMESPACE::DataType type);
+
+// Copies the dimensions of `ts` into a TIM-VX shape; a rank-0 shape becomes {1}.
+tim::vx::ShapeType OnnxShapeToTIMVXShape(const onnxruntime::TensorShape& ts);
+
+// Formats "<name>:<d0,d1,...>" for logging; "<null>" when there is no shape.
+std::string PrintNode(const onnxruntime::NodeArg& node_arg);
+
+// Formats "<d0,d1,...>" for logging; "<null>" when `shape` is empty.
+std::string PrintNode(const std::vector<int64_t> shape);
+
+// Size in bytes of one element of `type`, or 0 for types that are not handled.
+size_t GetTensorElementSize(const ONNXTensorElementDataType type);
+
+// Element count of `info` multiplied by GetTensorElementSize of its type.
+size_t GetTensorBytes(const Ort::TensorTypeAndShapeInfo& info);
+
+// TensorShape built from the NodeArg's shape proto; {1} when the shape is
+// missing or has no dimensions.
+TensorShape GetTensorShape(const onnxruntime::NodeArg& node_arg);
+
+// Unpacks `initializer` into a newly allocated byte buffer sized from the shape
+// of `node`. Returns nullptr for data types that are not handled.
+std::shared_ptr<uint8_t> UnpackTensor(
+    const NodeArg* node, const ONNX_NAMESPACE::TensorProto& initializer);
+
+// Maps "NOTSET" / "SAME_UPPER" / "SAME_LOWER" / "VALID" to a TIM-VX pad type;
+// any other string yields PadType::NONE.
+tim::vx::PadType GetPadType(const std::string type);
+
+// Returns length - axis - 1 after wrapping a negative axis by `length`.
+int32_t ReverseAxis(int32_t origin_axis, int32_t length);
+
+// Applies the single-axis reversal to every entry and returns them sorted
+// in ascending order.
+std::vector<int32_t> ReverseAxis(std::vector<int32_t> origin_axes, int32_t length);
+
+// True when the NodeArg's element type is BOOL, FLOAT, FLOAT16, INT8, UINT8,
+// INT32 or INT64; false when it has no type information.
+bool IsTypeSupported(const NodeArg* node_arg);
+
+// Kinds of quantized node units recognised by GetQuantizedOpType.
+// The QLinear* / (De)QuantizeLinear values are produced for single-node units;
+// the QDQ* values are produced for QDQ-group units, named after the op type of
+// the node unit.
+enum class QuantizedOpType : uint8_t {
+  Unknown,  // Unknown or not a quantized NodeUnit
+  DequantizeLinear,
+  QuantizeLinear,
+  QLinearConv,
+  QLinearMatMul,
+  QLinearAdd,
+  QLinearSigmoid,
+  QLinearAveragePool,
+  QLinearMul,
+  // Not yet supported
+  // QLinearReduceMean,
+  QDQConv,
+  QDQResize,
+  QDQAveragePool,
+  QDQAdd,
+  QDQMul,
+  QDQTranspose,
+  QDQReshape,
+  QDQSoftmax,
+  QDQConcat,
+  QDQGemm,
+  QDQMatMul,
+  // TODO(cfy) :Add other QDQ NodeUnit types
+};
+
+// Convolution flavours distinguished by GetConvType, based on the `group`
+// attribute and the weight tensor dimensions.
+enum class ConvType : uint8_t {
+  Regular,    // group == 1
+  Depthwise,  // group != 1 and weight dim 1 equals group
+  Grouped,    // any other group != 1 case
+};
+// Classifies a node unit; returns QuantizedOpType::Unknown when it is not one
+// of the recognised quantized kinds.
+QuantizedOpType GetQuantizedOpType(const NodeUnit& node_unit);
+
+// Chooses Regular / Depthwise / Grouped from the `group` attribute and the
+// weight initializer (input 1) of a Conv node unit.
+ConvType GetConvType(const NodeUnit& node_unit, const InitializedTensorSet& initializers);
+
+// If this is a quantized Conv (QLinearConv or QDQConv)
+bool IsQuantizedConv(QuantizedOpType quant_op_type);
+
+// If this is a quantized Pool (QLinearAveragePool or QDQAveragePool)
+bool IsQuantizedPool(QuantizedOpType quant_op_type);
+
+// If this is a quantized Gemm (QLinearMatMul or QDQMatMul/QDQGemm)
+bool IsQuantizedGemm(QuantizedOpType quant_op_type);
+
+// This quantized op is an operator or qdq node unit takes 2 inputs and produces 1 output
+// Such as QLinearConv, QLinearMatMul, QLinearAdd, QDQConv,...
+bool IsQuantizedBinaryOp(QuantizedOpType quant_op_type);
+
+// Check if a qlinear binary op has valid inputs, Qlinear[Conv/MatMul/Add]
+bool HasValidBinaryOpQuantizedInputTypes(const NodeUnit& node_unit);
+
+// Reads the scale and (optional) zero point referenced by `io_def.quant_param`
+// from `initializers`. `scale` / `zero_point` receive the first element of each
+// tensor (zero_point is 0 when absent); `pcq_scales` / `pcq_zps` are set only
+// when the corresponding tensor is per-channel (first dimension not 0 or 1).
+void GetQuantizationScaleAndZeroPoint(
+    const InitializedTensorSet& initializers, const NodeUnitIODef& io_def, const Path& model_path,
+    float& scale, int32_t& zero_point,
+    std::optional<std::vector<float>>& pcq_scales,
+    std::optional<std::vector<int32_t>>& pcq_zps);
+
+// Writes the tensor element type of `node_arg` to `type`. Returns false (and
+// sets `type` to UNDEFINED) when the NodeArg has no element type.
+bool GetType(const NodeArg& node_arg, int32_t& type);
+
+}  // namespace util
+}  // namespace npu
+}  // namespace vsi
+}  // namespace onnxruntime
--- a/onnxruntime/test/onnx/TestCase.cc
+++ b/onnxruntime/test/onnx/TestCase.cc
@ -796,7 +796,7 @@ void LoadTests(const std::vector<std::basic_string<PATH_CHAR_TYPE>>& input_paths
      auto test_case_dir = model_info->GetDir();
      auto test_case_name_in_log = test_case_name + ORT_TSTR(" in ") + test_case_dir.native();

-#if !defined(ORT_MINIMAL_BUILD) && !defined(USE_QNN)
+#if !defined(ORT_MINIMAL_BUILD) && !defined(USE_QNN) && !defined(USE_VSINPU)
      // to skip some models like *-int8 or *-qdq
      if ((reinterpret_cast<OnnxModelInfo*>(model_info.get()))->HasDomain(ONNX_NAMESPACE::AI_ONNX_TRAINING_DOMAIN) ||
          (reinterpret_cast<OnnxModelInfo*>(model_info.get()))->HasDomain(ONNX_NAMESPACE::AI_ONNX_PREVIEW_TRAINING_DOMAIN)) {
--- a/onnxruntime/test/onnx/main.cc
+++ b/onnxruntime/test/onnx/main.cc
@ -44,7 +44,7 @@ void usage() {
      "\t-r [repeat]: Specifies the number of times to repeat\n"
      "\t-v: verbose\n"
      "\t-n [test_case_name]: Specifies a single test case to run.\n"
-      "\t-e [EXECUTION_PROVIDER]: EXECUTION_PROVIDER could be 'cpu', 'cuda', 'dnnl', 'tensorrt', "
+      "\t-e [EXECUTION_PROVIDER]: EXECUTION_PROVIDER could be 'cpu', 'cuda', 'dnnl', 'tensorrt', 'vsinpu', "
      "'openvino', 'rocm', 'migraphx', 'acl', 'armnn', 'xnnpack', 'nnapi', 'qnn', 'snpe' or 'coreml'. "
      "Default: 'cpu'.\n"
      "\t-p: Pause after launch, can attach debugger and continue\n"
@ -169,6 +169,7 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
  bool enable_mem_pattern = true;
  bool enable_qnn = false;
  bool enable_nnapi = false;
+  bool enable_vsinpu = false;
  bool enable_coreml = false;
  bool enable_snpe = false;
  bool enable_dml = false;
@ -248,6 +249,8 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
            enable_qnn = true;
          } else if (!CompareCString(optarg, ORT_TSTR("nnapi"))) {
            enable_nnapi = true;
+          } else if (!CompareCString(optarg, ORT_TSTR("vsinpu"))) {
+            enable_vsinpu = true;
          } else if (!CompareCString(optarg, ORT_TSTR("coreml"))) {
            enable_coreml = true;
          } else if (!CompareCString(optarg, ORT_TSTR("snpe"))) {
@ -561,6 +564,14 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
 #else
      fprintf(stderr, "NNAPI is not supported in this build");
      return -1;
+#endif
+    }
+    if (enable_vsinpu) {
+#ifdef USE_VSINPU
+      Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_VSINPU(sf));
+#else
+      fprintf(stderr, "VSINPU is not supported in this build");
+      return -1;
 #endif
    }
    if (enable_coreml) {
--- a/onnxruntime/test/perftest/command_args_parser.cc
+++ b/onnxruntime/test/perftest/command_args_parser.cc
@ -261,6 +261,8 @@ static bool ParseSessionConfigs(const std::string& configs_string,
          test_config.machine_config.provider_type_name = onnxruntime::kSnpeExecutionProvider;
        } else if (!CompareCString(optarg, ORT_TSTR("nnapi"))) {
          test_config.machine_config.provider_type_name = onnxruntime::kNnapiExecutionProvider;
+        } else if (!CompareCString(optarg, ORT_TSTR("vsinpu"))) {
+          test_config.machine_config.provider_type_name = onnxruntime::kVSINPUExecutionProvider;
        } else if (!CompareCString(optarg, ORT_TSTR("coreml"))) {
          test_config.machine_config.provider_type_name = onnxruntime::kCoreMLExecutionProvider;
        } else if (!CompareCString(optarg, ORT_TSTR("dml"))) {
--- a/onnxruntime/test/perftest/ort_test_session.cc
+++ b/onnxruntime/test/perftest/ort_test_session.cc
@ -397,6 +397,12 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)");
    Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Nnapi(session_options, nnapi_flags));
 #else
    ORT_THROW("NNAPI is not supported in this build\n");
+#endif
+  } else if (provider_name_ == onnxruntime::kVSINPUExecutionProvider) {
+#ifdef USE_VSINPU
+    Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_VSINPU(session_options));
+#else
+    ORT_THROW("VSINPU is not supported in this build\n");
 #endif
  } else if (provider_name_ == onnxruntime::kCoreMLExecutionProvider) {
 #ifdef __APPLE__
--- a/onnxruntime/test/providers/base_tester.cc
+++ b/onnxruntime/test/providers/base_tester.cc
@ -428,6 +428,7 @@ bool SetEpsForAllNodes(Graph& graph,
      if (provider_type == onnxruntime::kOpenVINOExecutionProvider ||
          provider_type == onnxruntime::kTensorrtExecutionProvider ||
          provider_type == onnxruntime::kNnapiExecutionProvider ||
+          provider_type == onnxruntime::kVSINPUExecutionProvider ||
          provider_type == onnxruntime::kCoreMLExecutionProvider ||
          provider_type == onnxruntime::kDnnlExecutionProvider ||
          provider_type == onnxruntime::kQnnExecutionProvider ||
@ -649,6 +650,7 @@ void BaseTester::RunWithConfig(size_t* number_of_pre_packed_weights_counter,
          kAclExecutionProvider,
          kArmNNExecutionProvider,
          kNnapiExecutionProvider,
+          kVSINPUExecutionProvider,
          kRocmExecutionProvider,
          kCoreMLExecutionProvider,
          kCoreMLExecutionProviderMLProgram,
@ -688,6 +690,8 @@ void BaseTester::RunWithConfig(size_t* number_of_pre_packed_weights_counter,
          execution_provider = DefaultTensorrtExecutionProvider();
        else if (provider_type == onnxruntime::kNnapiExecutionProvider)
          execution_provider = DefaultNnapiExecutionProvider();
+        else if (provider_type == onnxruntime::kVSINPUExecutionProvider)
+          execution_provider = DefaultVSINPUExecutionProvider();
        else if (provider_type == onnxruntime::kRknpuExecutionProvider)
          execution_provider = DefaultRknpuExecutionProvider();
        else if (provider_type == onnxruntime::kAclExecutionProvider)
--- a/onnxruntime/test/providers/cpu/model_tests.cc
+++ b/onnxruntime/test/providers/cpu/model_tests.cc
@ -30,6 +30,10 @@
 #include "core/providers/nnapi/nnapi_provider_factory.h"
 #endif

+#ifdef USE_VSINPU
+#include "core/providers/vsinpu/vsinpu_provider_factory.h"
+#endif
+
 #ifdef USE_RKNPU
 #include "core/providers/rknpu/rknpu_provider_factory.h"
 #endif
@ -238,6 +242,11 @@ TEST_P(ModelTest, Run) {
        ASSERT_ORT_STATUS_OK(OrtSessionOptionsAppendExecutionProvider_Nnapi(ortso, 0));
      }
 #endif
+#ifdef USE_VSINPU
+      else if (provider_name == "vsinpu") {
+        ASSERT_ORT_STATUS_OK(OrtSessionOptionsAppendExecutionProvider_VSINPU(ortso));
+      }
+#endif
 #ifdef USE_RKNPU
      else if (provider_name == "rknpu") {
        ASSERT_ORT_STATUS_OK(OrtSessionOptionsAppendExecutionProvider_Rknpu(ortso));
@ -406,6 +415,9 @@ static constexpr ORT_STRING_VIEW provider_name_dnnl = ORT_TSTR("dnnl");
 #if defined(USE_NNAPI) && defined(__ANDROID__)
 static constexpr ORT_STRING_VIEW provider_name_nnapi = ORT_TSTR("nnapi");
 #endif
+#ifdef USE_VSINPU
+static constexpr ORT_STRING_VIEW provider_name_vsinpu = ORT_TSTR("vsinpu");
+#endif
 #ifdef USE_RKNPU
 static constexpr ORT_STRING_VIEW provider_name_rknpu = ORT_TSTR("rknpu");
 #endif
@ -447,6 +459,9 @@ static constexpr ORT_STRING_VIEW provider_name_dml = ORT_TSTR("dml");
 #if defined(USE_NNAPI) && defined(__ANDROID__)
  provider_names[provider_name_nnapi] = {opset7, opset8, opset9, opset10, opset11, opset12, opset13, opset14, opset15, opset16, opset17, opset18};
 #endif
+#ifdef USE_VSINPU
+  provider_names[provider_name_vsinpu] = {};
+#endif
 #ifdef USE_RKNPU
  provider_names[provider_name_rknpu] = {};
 #endif
--- a/onnxruntime/test/util/default_providers.cc
+++ b/onnxruntime/test/util/default_providers.cc
@ -190,6 +190,14 @@ std::unique_ptr<IExecutionProvider> DefaultNnapiExecutionProvider() {
 #endif
 }

+// Creates a VSINPU execution provider for tests through
+// VSINPUProviderFactoryCreator when the build defines USE_VSINPU; returns
+// nullptr otherwise.
+std::unique_ptr<IExecutionProvider> DefaultVSINPUExecutionProvider() {
+#if defined(USE_VSINPU)
+  return VSINPUProviderFactoryCreator::Create()->CreateProvider();
+#else
+  return nullptr;
+#endif
+}
+
+
 std::unique_ptr<IExecutionProvider> DefaultRknpuExecutionProvider() {
 #ifdef USE_RKNPU
  return RknpuProviderFactoryCreator::Create()->CreateProvider();
--- a/onnxruntime/test/util/include/default_providers.h
+++ b/onnxruntime/test/util/include/default_providers.h
@ -20,6 +20,7 @@ std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_MIGrap
 std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Nnapi(
    uint32_t flags, const optional<std::string>& partitioning_stop_ops_list);
 // std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Tvm(const char*);
+std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_VSINPU();
 std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Rknpu();
 std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Rocm(const OrtROCMProviderOptions* provider_options);
 std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Tensorrt(const OrtTensorRTProviderOptions* params);
@ -50,6 +51,7 @@ std::unique_ptr<IExecutionProvider> MIGraphXExecutionProviderWithOptions(const O
 std::unique_ptr<IExecutionProvider> OpenVINOExecutionProviderWithOptions(const OrtOpenVINOProviderOptions* params);
 std::unique_ptr<IExecutionProvider> DefaultOpenVINOExecutionProvider();
 std::unique_ptr<IExecutionProvider> DefaultNnapiExecutionProvider();
+std::unique_ptr<IExecutionProvider> DefaultVSINPUExecutionProvider();
 std::unique_ptr<IExecutionProvider> DefaultRknpuExecutionProvider();
 std::unique_ptr<IExecutionProvider> DefaultAclExecutionProvider(bool enable_arena = true);
 std::unique_ptr<IExecutionProvider> DefaultArmNNExecutionProvider(bool enable_arena = true);
--- a/onnxruntime/test/util/include/providers.h
+++ b/onnxruntime/test/util/include/providers.h
@ -16,6 +16,9 @@
 #ifdef USE_NNAPI
 #include "core/providers/nnapi/nnapi_provider_factory.h"
 #endif
+#ifdef USE_VSINPU
+#include "core/providers/vsinpu/vsinpu_provider_factory.h"
+#endif
 #ifdef USE_COREML
 #include "core/providers/coreml/coreml_provider_factory.h"
 #endif
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@ -561,6 +561,7 @@ def parse_arguments():
    parser.add_argument("--use_snpe", action="store_true", help="Build with SNPE support.")
    parser.add_argument("--snpe_root", help="Path to SNPE SDK root.")
    parser.add_argument("--use_nnapi", action="store_true", help="Build with NNAPI support.")
+    parser.add_argument("--use_vsinpu", action="store_true", help="Build with VSINPU support.")
    parser.add_argument(
        "--nnapi_min_api", type=int, help="Minimum Android API level to enable NNAPI, should be no less than 27"
    )
@ -1020,6 +1021,7 @@ def generate_build_tree(
        "-Donnxruntime_BUILD_APPLE_FRAMEWORK=" + ("ON" if args.build_apple_framework else "OFF"),
        "-Donnxruntime_USE_DNNL=" + ("ON" if args.use_dnnl else "OFF"),
        "-Donnxruntime_USE_NNAPI_BUILTIN=" + ("ON" if args.use_nnapi else "OFF"),
+        "-Donnxruntime_USE_VSINPU=" + ("ON" if args.use_vsinpu else "OFF"),
        "-Donnxruntime_USE_RKNPU=" + ("ON" if args.use_rknpu else "OFF"),
        "-Donnxruntime_USE_LLVM=" + ("ON" if args.use_tvm else "OFF"),
        "-Donnxruntime_ENABLE_MICROSOFT_INTERNAL=" + ("ON" if args.enable_msinternal else "OFF"),
				`@ -0,0 +1 @@`
				`OrtSessionOptionsAppendExecutionProvider_VSINPU`