diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 3e3583b445..eba8c0a0bd 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -343,6 +343,11 @@ if (onnxruntime_USE_ACL)
   list(APPEND onnxruntime_EXTERNAL_LIBRARIES arm_compute acl arm_compute_graph arm_compute_core)
 endif()
 
+# MKLML: included before TVM because the TVM section reads MKLML_LIB_DIR / IOMP5MD_* (presumably set by mkldnn.cmake)
+if (onnxruntime_USE_MKLDNN OR onnxruntime_USE_MKLML)
+  include(mkldnn)
+endif()
+
 # TVM
 if (onnxruntime_USE_TVM)
   if (onnxruntime_USE_CUDA)
@@ -352,6 +357,19 @@ if (onnxruntime_USE_TVM)
     set(USE_LLVM ON)
     add_definitions(-DUSE_TVM_WITH_LLVM)
   endif()
+  if (onnxruntime_USE_OPENMP)
+    set(USE_OPENMP "gnu")
+  endif()
+  if (onnxruntime_USE_MKLML)
+    set(USE_OPENMP "intel")
+    # make sure MKLML in ORT is used by TVM
+    if (WIN32)
+      set(OMP_LIBRARY ${MKLML_LIB_DIR}/${IOMP5MD_IMPORT_LIB})
+    else()
+      set(OMP_LIBRARY ${MKLML_LIB_DIR}/${IOMP5MD_SHARED_LIB})
+    endif()
+  endif()
+
   add_subdirectory(${PROJECT_SOURCE_DIR}/external/tvm EXCLUDE_FROM_ALL)
   set_target_properties(tvm PROPERTIES FOLDER "External/tvm")
   set_target_properties(tvm_topi PROPERTIES FOLDER "External/tvm")
@@ -501,10 +519,6 @@ include_directories(
   ${REPO_ROOT}/include/onnxruntime/core/session
 )
 
-if (onnxruntime_USE_MKLDNN OR onnxruntime_USE_MKLML)
-  include(mkldnn)
-endif()
-
 if(onnxruntime_USE_GEMMLOWP)
   add_definitions(-DUSE_GEMMLOWP=1)
 endif()
diff --git a/cmake/external/tvm b/cmake/external/tvm
index 9b3a424a91..c6e3efcdb0 160000
--- a/cmake/external/tvm
+++ b/cmake/external/tvm
@@ -1 +1 @@
-Subproject commit 9b3a424a91d6003db1993cdd7121e46696f220e8
+Subproject commit c6e3efcdb09aeda961a6badf76093ceac69db64d
diff --git a/docs/execution_providers/Nuphar-ExecutionProvider.md b/docs/execution_providers/Nuphar-ExecutionProvider.md
index 05c6d57bfc..17244fcb8d 100644
--- a/docs/execution_providers/Nuphar-ExecutionProvider.md
+++ b/docs/execution_providers/Nuphar-ExecutionProvider.md
@@ -17,6 +17,12 @@ You can use the Nuphar execution provider via the python wheel from the ONNX Run
 ## Performance and Accuracy Testing
 You can test your ONNX model's performance with [onnxruntime_perf_test](../../onnxruntime/test/perftest/README.md), or test accuracy with [onnx_test_runner](../../onnxruntime/test/onnx/README.txt). To run these tools with the Nuphar execution provider, please pass `-e nuphar` in command line options.
 
+Please note that Nuphar uses the TVM thread pool and parallel schedule for multi-threaded inference performance. When building with OpenMP or MKLML, the TVM thread pool uses gomp or iomp as its implementation; otherwise, TVM creates its own thread pool. Because of this, the current default parallel schedule policy is:
+- Default to on for USE_OPENMP or USE_MKLML. Users can use OMP_NUM_THREADS/MKL_NUM_THREADS to control the TVM thread pool, as well as TVM_NUM_THREADS.
+- Default to off when neither of the above is used. Users can use TVM_NUM_THREADS to control the TVM thread pool.
+
+This choice is meant to give ideal performance with the different build options. When building with USE_OPENMP or USE_MKLML, users have to avoid thread conflicts between OpenMP or MKL and their inference invocations anyway, so parallel schedule is enabled to leverage the existing thread pool. When not building with gomp or iomp, parallel schedule is turned off by default so that the TVM thread pool does not conflict with user threads. If needed, users can set the environment variable or setting [NUPHAR_PARALLEL_MIN_WORKLOADS](../../onnxruntime/core/providers/nuphar/common/nuphar_settings.cc#L61) to 0 to disable parallel schedule, or to some non-zero value to enable parallel schedule. The non-zero value indicates the minimal number of elements that must be computed per thread for parallel schedule to be turned on.
+
 ## Model Conversion and Quantization
 You may use Python script [model_editor.py](../../onnxruntime/core/providers/nuphar/scripts/model_editor.py) to turn LSTM/GRU/RNN ops to Scan ops for a given model, and then use [model_quantizer.py](../../onnxruntime/core/providers/nuphar/scripts/model_quantizer.py) to quantize MatMul ops into MatMulInteger ops.
 
diff --git a/onnxruntime/core/codegen/passes/scheduler/schedule_utils.cc b/onnxruntime/core/codegen/passes/scheduler/schedule_utils.cc
index 8e86f13eaa..3595229bbe 100644
--- a/onnxruntime/core/codegen/passes/scheduler/schedule_utils.cc
+++ b/onnxruntime/core/codegen/passes/scheduler/schedule_utils.cc
@@ -58,6 +58,19 @@ bool InsertRootScheduleAndClosure(
   return true;
 }
 
+// Precondition for vectorization: the tensor must not carry a schedule beyond inline
+bool ShouldTryVectorization(
+    const tvm::Tensor& tensor,
+    ScheduleContext& ctx) {
+  const auto found = ctx.scheduled_tensors.find(tensor->op.get());
+  const bool has_stronger_schedule =
+      found != ctx.scheduled_tensors.end() &&
+      found->second > ScheduleType::ScheduleInline;
+
+  // unscheduled tensors, and those scheduled at most inline, may still be vectorized
+  return !has_stronger_schedule;
+}
+
 // Check the schedule of tensor
 // If it is not scheduled, try to vectorize it.
 // Note TryVectorization has to use with compute_root.
@@ -66,12 +79,8 @@ bool TryVectorization(
     const tvm::Tensor& tensor,
     int64_t natural_vector_size,
     ScheduleContext& ctx) {
-  auto it = ctx.scheduled_tensors.find(tensor->op.get());
-  if (it != ctx.scheduled_tensors.end()) {
-    if (it->second > ScheduleType::ScheduleInline) {
-      return false;
-    }
-  }
+  if (!ShouldTryVectorization(tensor, ctx))
+    return false;
 
   auto shape = tensor->shape;
   auto rank = shape.size();
diff --git a/onnxruntime/core/codegen/passes/scheduler/schedule_utils.h b/onnxruntime/core/codegen/passes/scheduler/schedule_utils.h
index f928928f30..757366b551 100644
--- a/onnxruntime/core/codegen/passes/scheduler/schedule_utils.h
+++ b/onnxruntime/core/codegen/passes/scheduler/schedule_utils.h
@@ -26,6 +26,11 @@ bool InsertRootScheduleAndClosure(
     const tvm::Tensor& tensor,
     ScheduleContext& ctx);
 
+// Returns false if tensor is already scheduled with a type greater than ScheduleType::ScheduleInline, true otherwise
+bool ShouldTryVectorization(
+    const tvm::Tensor& tensor,
+    ScheduleContext& ctx);
+
 // Check the schedule of tensor
 // If it is not scheduled, try to vectorize it.
 // Note TryVectorization has to use with compute_root.
diff --git a/onnxruntime/core/providers/nuphar/common/nuphar_settings.cc b/onnxruntime/core/providers/nuphar/common/nuphar_settings.cc
index 389e4f817b..271e12f50c 100644
--- a/onnxruntime/core/providers/nuphar/common/nuphar_settings.cc
+++ b/onnxruntime/core/providers/nuphar/common/nuphar_settings.cc
@@ -38,7 +38,8 @@ static const std::unordered_set<std::string> valid_keys = {
     kNupharCacheSoName,
     kNupharCacheModelChecksum,
     kNupharCacheForceNoJIT,
-    kNupharCodeGenTarget};
+    kNupharCodeGenTarget,
+    kNupharParallelMinWorkloads};
 
 void SetDefaultOptions(std::map<std::string, std::string>& options) {
   // create two temporary strings to get rid of the odr-use issue introduced
@@ -56,6 +57,20 @@ void SetDefaultOptions(std::map<std::string, std::string>& options) {
   std::string cache_so_name_opt(kNupharCacheSoName);
   std::string cache_so_name_default(kNupharCacheSoName_Default);
   options.insert(std::make_pair(cache_so_name_opt, cache_so_name_default));
+
+  std::string parallel_min_workloads_opt(kNupharParallelMinWorkloads);
+#if defined(USE_OPENMP) || defined(USE_MKLML)
+  // a rough estimate of workloads based on static dimensions for each thread, when using parallel schedule
+  // user may change it to 0 to turn it off,
+  // or use OMP_NUM_THREADS to control TVM thread pool similar to control MKL
+  unsigned int parallel_min_workloads_default = 64;
+#else
+  // turn off parallel schedule by default to avoid TVM thread pool confliction with others
+  // this is to ensure performance when user runs multiple inference threads, with each runs as single thread
+  // if needed, user can override it with settings, and use TVM_NUM_THREADS to control the thread pool
+  unsigned int parallel_min_workloads_default = 0;
+#endif
+  options.insert(std::make_pair(parallel_min_workloads_opt, std::to_string(parallel_min_workloads_default)));
 }
 
 void CreateNupharCodeGenSettings(const NupharExecutionProviderInfo& info) {
diff --git a/onnxruntime/core/providers/nuphar/common/nuphar_settings.h b/onnxruntime/core/providers/nuphar/common/nuphar_settings.h
index 8e836b7e84..5d2c149186 100644
--- a/onnxruntime/core/providers/nuphar/common/nuphar_settings.h
+++ b/onnxruntime/core/providers/nuphar/common/nuphar_settings.h
@@ -45,6 +45,9 @@ constexpr static const char* kNupharActivations_DeepCpu = "deep_cpu_activation";
 // Option to control nuphar code generation target (avx / avx2 / avx512)
 constexpr static const char* kNupharCodeGenTarget = "nuphar_codegen_target";
 
+// Option for the minimal per-thread workload required to use parallel schedule; a value of 0 (or less) turns it off
+constexpr static const char* kNupharParallelMinWorkloads = "nuphar_parallel_min_workloads";
+
 // cache version number (MAJOR.MINOR.PATCH) following https://semver.org/
 // 1. MAJOR version when you make incompatible changes that old cache files no longer work,
 // 2. MINOR version when you add functionality in a backwards - compatible manner, and
diff --git a/onnxruntime/core/providers/nuphar/compiler/nuphar_handle.h b/onnxruntime/core/providers/nuphar/compiler/nuphar_handle.h
index 84be4555ba..1b5f91990d 100644
--- a/onnxruntime/core/providers/nuphar/compiler/nuphar_handle.h
+++ b/onnxruntime/core/providers/nuphar/compiler/nuphar_handle.h
@@ -28,7 +28,7 @@ struct NupharCodeGenHandle : codegen::CodeGenHandle {
   std::shared_ptr<tvm_codegen::TVMScheduleBuilder> schedule_builder;  // keep
   // maybe add a layout
   tvm_codegen::WeightLayoutRegistry* layout_registry;
-  bool enable_per_node_parallelized;  // TODO: change to config
+  int64_t parallel_min_workloads;
 
   bool allow_unaligned_buffers;  // move to another place
 
diff --git a/onnxruntime/core/providers/nuphar/compiler/nuphar_schedule_builder.cc b/onnxruntime/core/providers/nuphar/compiler/nuphar_schedule_builder.cc
index 3f54d50a56..47e7030551 100644
--- a/onnxruntime/core/providers/nuphar/compiler/nuphar_schedule_builder.cc
+++ b/onnxruntime/core/providers/nuphar/compiler/nuphar_schedule_builder.cc
@@ -9,6 +9,7 @@
 
 #include "core/providers/nuphar/common/analysis/subgraph_codegen_stats.h"
 #include "core/providers/nuphar/compiler/x86/x86_target_info.h"
+#include "core/providers/nuphar/compiler/x86/scheduler/nuphar_scheduler.h"
 
 // TODO change name space
 namespace onnxruntime {
@@ -37,9 +38,8 @@ static void Traverse(const tvm::Tensor& tensor,
   if (is_real_output) {
     CodeGenTargetX86* target = dynamic_cast<CodeGenTargetX86*>(ctx_codegen.GetCodeGenHandle()->codegen_target);
     ORT_ENFORCE(target != nullptr);
-    int64_t natural_vector_size = target->NaturalVectorWidth(tensor->dtype.bits());
 
-    TryVectorization(tensor, natural_vector_size, ctx_schedule);  // to x86
+    TryVectorizationX86(tensor, ctx_codegen, ctx_schedule);
     InsertRootScheduleAndClosure(tensor, ctx_schedule);
   }
 
diff --git a/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/nuphar_scheduler.h b/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/nuphar_scheduler.h
index a41642e7e3..96e0fbc584 100644
--- a/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/nuphar_scheduler.h
+++ b/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/nuphar_scheduler.h
@@ -39,5 +39,13 @@ bool InputRootScheduleWithVectorizationX86(
     tvm_codegen::CodeGenContext& ctx_codegen,
     tvm_codegen::ScheduleContext& ctx_sched);
 
+bool TryParallelX86(
+    const tvm::Tensor& tensor,
+    int64_t to_dim,  // fuse dims before to_dim for parallel schedule, 0 to fuse all but last dim
+    tvm_codegen::CodeGenContext& ctx_codegen,
+    tvm_codegen::ScheduleContext& ctx_sched);  // returns true only when parallel schedule was applied
+
+constexpr auto kNupharScheduleNoParallel = "nuphar_schedule_no_parallel";  // compute op attr key; when set, TryParallelX86 returns false
+
 }  // namespace nuphar
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/ort_type_schedule.cc b/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/ort_type_schedule.cc
index d32e35796a..2cc444dc18 100644
--- a/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/ort_type_schedule.cc
+++ b/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/ort_type_schedule.cc
@@ -3,26 +3,99 @@
 
 #include "core/providers/nuphar/compiler/x86/scheduler/nuphar_scheduler.h"
 
+#include "core/codegen/passes/scheduler/schedule_utils.h"
+#include "core/framework/op_kernel_info.h"
+#include "core/providers/nuphar/common/nuphar_settings.h"
 #include "core/providers/nuphar/common/analysis/subgraph_codegen_stats.h"
 #include "core/providers/nuphar/compiler/nuphar_codegen_ctx.h"
-#include "core/codegen/passes/scheduler/schedule_utils.h"
 #include "core/providers/nuphar/compiler/x86/scheduler/tensorize/intrin_gemv_ll_extern.h"
 #include "core/providers/nuphar/compiler/x86/scheduler/tensorize/intrin_gemv_ll_ir.h"
 #include "core/providers/nuphar/compiler/x86/x86_target_info.h"
-#include "core/framework/op_kernel_info.h"
 #include <tvm/tvm.h>
+#include <tvm/ir_pass.h>
 
 namespace onnxruntime {
 namespace nuphar {
 
+bool TryParallelX86(
+    const tvm::Tensor& tensor,
+    int64_t to_dim,
+    tvm_codegen::CodeGenContext& ctx_codegen,
+    tvm_codegen::ScheduleContext& ctx_sched) {
+  // Only compute ops that did not opt out via kNupharScheduleNoParallel are candidates.
+  const auto* op_node = tensor->op.as<tvm::ComputeOpNode>();
+  if (op_node == nullptr || op_node->attrs.count(kNupharScheduleNoParallel)) {
+    return false;
+  }
+
+  const auto& dims = tensor->shape;
+  const int num_dims = gsl::narrow<int>(dims.size());
+
+  // Collect the leading axes to fuse; to_dim == 0 selects every dim except the last one.
+  int64_t split_at = (to_dim != 0) ? to_dim : num_dims - 1;
+  tvm::Array<tvm::IterVar> leading_axes;
+  for (int64_t d = 0; d < split_at && d < gsl::narrow<int64_t>(op_node->axis.size()); ++d) {
+    tvm::IterVar iv = op_node->axis[d];
+    if (!tvm::ir::Equal(iv->dom->extent, dims[d])) {
+      // extent no longer matches the shape: this axis was fused or split already,
+      // so parallel schedule stops in front of it
+      split_at = d;
+      break;
+    }
+    leading_axes.push_back(iv);
+  }
+
+  if (leading_axes.size() == 0) {
+    return false;
+  }
+
+  // Estimate the work each thread gets from the statically known extents:
+  // all reduce axes plus every dim that is not part of the parallel loop.
+  int64_t static_work = 1;
+  for (const auto& r_axis : op_node->reduce_axis) {
+    if (const int64_t* extent = tvm::as_const_int(r_axis->dom->extent)) {
+      static_work *= *extent;
+    }
+  }
+  for (int64_t d = split_at; d < num_dims; ++d) {
+    if (const int64_t* extent = tvm::as_const_int(dims[d])) {
+      static_work *= *extent;
+    }
+  }
+
+  // skip small per thread workloads, note that symbolic dims are ignored (treated as 1);
+  // a threshold of zero or less turns parallel schedule off altogether
+  const int64_t min_work =
+      Promote<NupharCodeGenCtx>(&ctx_codegen)->GetCodeGenHandle()->parallel_min_workloads;
+  if (min_work <= 0 || static_work < min_work) {
+    return false;
+  }
+
+  // A single axis is used directly; several axes are fused into one first.
+  tvm::IterVar parallel_axis = leading_axes[0];
+  if (leading_axes.size() > 1) {
+    ctx_sched.schedule[tensor->op].fuse(leading_axes, &parallel_axis);
+  }
+  ctx_sched.schedule[tensor->op].parallel(parallel_axis);
+  return true;
+}
+
 bool TryVectorizationX86(
     const tvm::Tensor& tensor,
     tvm_codegen::CodeGenContext& ctx_codegen,
     tvm_codegen::ScheduleContext& ctx_sched) {
+  if (!ShouldTryVectorization(tensor, ctx_sched))
+    return false;
+
   CodeGenTargetX86* target = dynamic_cast<CodeGenTargetX86*>(ctx_codegen.GetCodeGenHandle()->codegen_target);
   ORT_ENFORCE(target != nullptr);
   int64_t natural_vector_size = target->NaturalVectorWidth(tensor->dtype.bits());
 
+  // Try to apply parallel schedule (to_dim = 0: fuse all but the last dim) before vectorizing.
+  // Its result is deliberately ignored rather than logic-or'ed into the return value,
+  // so that the vectorization below is always tried.
+  TryParallelX86(tensor, 0, ctx_codegen, ctx_sched);
+
   return TryVectorization(tensor, natural_vector_size, ctx_sched);
 }
 
@@ -176,7 +249,7 @@ static Status ConvScheduleX86(const tvm::Tensor& tensor,
 
   ctx_sched.schedule[tensor->op].reorder({b, oc_chunk, y, xo, ic_chunk, m, n, ic_block, xi, oc_block});
 
-  if (ctx_codegen.GetCodeGenHandle()->enable_per_node_parallelized) {
+  if (ctx_codegen.GetCodeGenHandle()->parallel_min_workloads > 0) {
     tvm::Array<tvm::IterVar> fused_axis;
     fused_axis.push_back(b);
     fused_axis.push_back(oc_chunk);
@@ -186,6 +259,7 @@ static Status ConvScheduleX86(const tvm::Tensor& tensor,
     ctx_sched.schedule[tensor->op].fuse(fused_axis, &parallel_axis);
     ctx_sched.schedule[tensor->op].parallel(parallel_axis);
   }
+
   ctx_sched.schedule[tensor->op].vectorize(oc_block);
 
   return Status::OK();
@@ -243,7 +317,7 @@ static Status MatMul_2DWeight_Schedule(
   ctx_sched.schedule[CC->op].unroll(ki);
   ctx_sched.schedule[CC->op].vectorize(yc);
 
-  if (ctx_codegen.GetCodeGenHandle()->enable_per_node_parallelized) {
+  if (ctx_codegen.GetCodeGenHandle()->parallel_min_workloads > 0) {
     // parallelize
     tvm::Array<tvm::IterVar> fused_axis;
     for (size_t d = 0; d < C_rank - 2; ++d)
diff --git a/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/tvm_rule_schedule.cc b/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/tvm_rule_schedule.cc
index 646f22d7d7..d3f923f4e6 100644
--- a/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/tvm_rule_schedule.cc
+++ b/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/tvm_rule_schedule.cc
@@ -22,6 +22,7 @@ bool TVM_SCHEDULER_CLASS(Extern, NupharX86TVMRule)::Evaluate(
 
 static bool ReduceVScheduleNupharX86(
     const tvm::Tensor& tensor,
+    tvm_codegen::CodeGenContext& ctx_codegen,
     tvm_codegen::ScheduleContext& ctx_sched) {
   InsertRootScheduleAndClosure(tensor, ctx_sched);
 
@@ -55,6 +56,8 @@ static bool ReduceVScheduleNupharX86(
     if (shape.size() > 0)
       head_dim = as_const_int(shape[0]);
 
+    bool try_parallel = true;
+
     // unroll packed reduce by checking head dim
     if (nullptr != head_dim) {
       // if head_dim is already fused, don't unroll
@@ -81,8 +84,13 @@ static bool ReduceVScheduleNupharX86(
 
         ctx_sched.schedule[tensor->op].reorder(reorder_axis);
         ctx_sched.schedule[tensor->op].unroll(x0);
+        try_parallel = false;
       }
     }
+
+    if (try_parallel) {
+      TryParallelX86(tensor, *fuse_dim, ctx_codegen, ctx_sched);
+    }
   } else if (compute_op->axis.size() > 0 &&
              tvm::as_const_int(tensor->shape[0]) != nullptr) {
     tvm::IterVar x = compute_op->axis[0];
@@ -101,7 +109,7 @@ static bool ReduceVScheduleNupharX86(
 bool TVM_SCHEDULER_CLASS(Reduce, NupharX86TVMRule)::Evaluate(
     const tvm::Tensor& tensor,
     const Node*,
-    tvm_codegen::CodeGenContext&,
+    tvm_codegen::CodeGenContext& ctx_codegen,
     tvm_codegen::ScheduleContext& ctx_sched) {
   // respect topi::kCommReduce
   if (tensor->op->tag == topi::kCommReduce) {
@@ -109,7 +117,7 @@ bool TVM_SCHEDULER_CLASS(Reduce, NupharX86TVMRule)::Evaluate(
   }
 
   if (tensor->op->tag == nuphar::kNupharVReduce) {
-    return ReduceVScheduleNupharX86(tensor, ctx_sched);
+    return ReduceVScheduleNupharX86(tensor, ctx_codegen, ctx_sched);
   }
 
   // unknown goes to InsertRootScheduleAndClosure
diff --git a/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc b/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc
index e816c77ed9..2bcb6f0010 100644
--- a/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc
+++ b/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc
@@ -7,6 +7,7 @@
 #include "core/codegen/mti/mti_tvm_utils.h"
 #include "core/codegen/mti/tensor/pad_ops.h"
 #include "core/codegen/mti/tensor/reshape_ops.h"
+#include "core/providers/nuphar/compiler/x86/scheduler/nuphar_scheduler.h"
 #include <topi/reduction.h>
 
 namespace onnxruntime {
@@ -85,7 +86,7 @@ tvm::Tensor ReduceValueWithoutSplit(const tvm::Tensor& X,
 
   tvm::Map<std::string, tvm::NodeRef> attrs;
   attrs.Set(kNupharVReduceFuseDim, tvm::Expr(fuse_dim));
-
+  attrs.Set(kNupharScheduleNoParallel, tvm::Expr(true));
   return tvm::compute(output_shape, l_out, name + "_regular_reduce", kNupharVReduce, attrs);
 }
 
diff --git a/onnxruntime/core/providers/nuphar/nuphar_execution_provider.cc b/onnxruntime/core/providers/nuphar/nuphar_execution_provider.cc
index b36d526828..d244c51d30 100644
--- a/onnxruntime/core/providers/nuphar/nuphar_execution_provider.cc
+++ b/onnxruntime/core/providers/nuphar/nuphar_execution_provider.cc
@@ -125,8 +125,8 @@ NupharExecutionProvider::NupharExecutionProvider(const NupharExecutionProviderIn
 
   handle->shape_inference = whole_graph_shape_infer_;
 
-  // TODO: remove
-  handle->enable_per_node_parallelized = info.enable_per_node_parallel;
+  handle->parallel_min_workloads = std::stoi(settings.GetOptionValue(kNupharParallelMinWorkloads));
+
   // TODO: remove
   handle->allow_unaligned_buffers = info.allow_unaligned_buffers;  // TODO remove this
 
diff --git a/onnxruntime/core/providers/nuphar/nuphar_execution_provider.h b/onnxruntime/core/providers/nuphar/nuphar_execution_provider.h
index 40c7821032..07ca11e4f0 100644
--- a/onnxruntime/core/providers/nuphar/nuphar_execution_provider.h
+++ b/onnxruntime/core/providers/nuphar/nuphar_execution_provider.h
@@ -31,9 +31,6 @@ constexpr const char* default_nuphar_target_str = stackvm_target_str;
 
 // Information needed to construct Nuphar execution providers.
 struct NupharExecutionProviderInfo {
-  // By default, let provider decide the target by passing in empty string.
-  bool enable_per_node_parallel;  // TODO: remove
-
   // this flag set TVM build_config with data_alignment=1, at the cost of performance
   bool allow_unaligned_buffers;
 
@@ -43,10 +40,8 @@ struct NupharExecutionProviderInfo {
   std::string settings;
 
   explicit NupharExecutionProviderInfo(bool unaligned_buffers,
-                                       const std::string& str_settings = "",
-                                       bool per_node_parallel = true)
-      : enable_per_node_parallel(per_node_parallel),
-        allow_unaligned_buffers(unaligned_buffers),
+                                       const std::string& str_settings = "")
+      : allow_unaligned_buffers(unaligned_buffers),
         settings(str_settings) {}
   NupharExecutionProviderInfo() = default;
 };
diff --git a/onnxruntime/core/providers/nuphar/nuphar_provider_factory.cc b/onnxruntime/core/providers/nuphar/nuphar_provider_factory.cc
index 4fdcbfa277..e5c2989e51 100644
--- a/onnxruntime/core/providers/nuphar/nuphar_provider_factory.cc
+++ b/onnxruntime/core/providers/nuphar/nuphar_provider_factory.cc
@@ -21,7 +21,7 @@ struct NupharExecutionProviderFactory : IExecutionProviderFactory {
 };
 
 std::unique_ptr<IExecutionProvider> NupharExecutionProviderFactory::CreateProvider() {
-  NupharExecutionProviderInfo info(allow_unaligned_buffers_, settings_, /*per_node_parallel*/ true);
+  NupharExecutionProviderInfo info(allow_unaligned_buffers_, settings_);
   return onnxruntime::make_unique<NupharExecutionProvider>(info);
 }