diff --git a/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h b/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h
index f7857a3cdc..03715eb5b7 100644
--- a/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h
+++ b/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h
@@ -24,9 +24,13 @@ enum COREMLFlags {
   // Please note, enable this option does not guarantee the entire model to be executed using ANE only
   COREML_FLAG_ONLY_ENABLE_DEVICE_WITH_ANE = 0x004,
 
-  // Keep COREML_FLAG_MAX at the end of the enum definition
+  // Only allow CoreML EP to take nodes with inputs with static shapes. By default it will also allow inputs with
+  // dynamic shapes. However, the performance may be negatively impacted if inputs have dynamic shapes.
+  COREML_FLAG_ONLY_ALLOW_STATIC_INPUT_SHAPES = 0x008,
+
+  // Keep COREML_FLAG_LAST at the end of the enum definition
   // And assign the last COREMLFlag to it
-  COREML_FLAG_LAST = COREML_FLAG_ONLY_ENABLE_DEVICE_WITH_ANE,
+  COREML_FLAG_LAST = COREML_FLAG_ONLY_ALLOW_STATIC_INPUT_SHAPES,
 };
 
 #ifdef __cplusplus
diff --git a/onnxruntime/core/providers/coreml/builders/coreml_spec.h b/onnxruntime/core/providers/coreml/builders/coreml_spec.h
new file mode 100644
index 0000000000..631bb7e258
--- /dev/null
+++ b/onnxruntime/core/providers/coreml/builders/coreml_spec.h
@@ -0,0 +1,14 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+// TODO come up with a more intuitive way of limiting this to Apple platform builds
+// E.g., putting CoreML EP files that should be enabled iff `defined(__APPLE__)` in a separate directory.
+#if !defined(__APPLE__)  // reject inclusion from non-Apple builds at compile time
+#error "This file should only be included when building on Apple platforms."
+#endif
+
+#include "coreml/Model.pb.h"
+
+namespace COREML_SPEC = CoreML::Specification;  // shorthand alias for the CoreML::Specification namespace
diff --git a/onnxruntime/core/providers/coreml/builders/helper.cc b/onnxruntime/core/providers/coreml/builders/helper.cc
index d062d59de8..731018fa6b 100644
--- a/onnxruntime/core/providers/coreml/builders/helper.cc
+++ b/onnxruntime/core/providers/coreml/builders/helper.cc
@@ -14,65 +14,58 @@
 #include "core/graph/graph_viewer.h"
 #include "core/providers/common.h"
 #include "core/providers/coreml/builders/op_builder_factory.h"
+#include "core/providers/coreml/builders/op_builder.h"
+#include "core/providers/coreml/coreml_provider_factory.h"  // for COREMLFlags
 #include "core/providers/coreml/model/host_utils.h"
+#include "core/providers/coreml/shape_utils.h"
 
 namespace onnxruntime {
 namespace coreml {
 
-bool GetShape(const NodeArg& node_arg, std::vector<int64_t>& shape, const logging::Logger& logger) {
-  const auto* shape_proto = node_arg.Shape();
-  if (!shape_proto) {
-    LOGS(logger, WARNING) << "NodeArg [" << node_arg.Name() << "] has no shape info";
-    return false;
-  }
-
-  // We already checked the shape has no dynamic dimension
-  for (const auto& dim : shape_proto->dim()) {
-    shape.push_back(dim.dim_value());
-  }
-
-  return true;
+OpBuilderInputParams MakeOpBuilderParams(const GraphViewer& graph_viewer, uint32_t coreml_flags) {
+  const bool only_allow_static_input_shapes = (coreml_flags & COREML_FLAG_ONLY_ALLOW_STATIC_INPUT_SHAPES) != 0;
+  return OpBuilderInputParams{graph_viewer, only_allow_static_input_shapes};
 }
 
-bool IsNodeSupported(const Node& node, const GraphViewer& graph_viewer, const logging::Logger& logger) {
+bool IsNodeSupported(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) {
   const auto& op_builders = GetOpBuilders();
   if (Contains(op_builders, node.OpType())) {
     const auto* op_builder = op_builders.at(node.OpType());
-    OpBuilderInputParams input_params(graph_viewer);
     return op_builder->IsOpSupported(node, input_params, logger);
   } else {
     return false;
   }
 }
 
-bool IsInputSupported(const NodeArg& input, const std::string& parent_name, const logging::Logger& logger) {
+bool IsInputSupported(const NodeArg& input, const std::string& parent_name,
+                      const OpBuilderInputParams& input_params, const logging::Logger& logger) {
   if (!input.Exists()) {
     // optional input that is not provided
     return true;
   }
 
   const auto& input_name = input.Name();
-  const auto* shape_proto = input.Shape();
+  std::vector<int64_t> shape;
   // We do not support input with no shape
-  if (!shape_proto) {
+  if (!GetShape(input, shape, logger)) {
     LOGS(logger, VERBOSE) << "Input [" << input_name << "] of [" << parent_name
-                          << "] has not shape";
+                          << "] has no shape";
     return false;
   }
 
-  for (const auto& dim : shape_proto->dim()) {
-    // For now we do not support dynamic shape
-    if (!dim.has_dim_value()) {
-      LOGS(logger, WARNING) << "Dynamic shape is not supported for now, for input:" << input_name;
-      return false;
-    }
+  if (input_params.only_allow_static_input_shapes && !IsStaticShape(shape)) {
+    LOGS(logger, VERBOSE) << "CoreML EP is set to only allow static input shapes. Input has a dynamic shape. Input: "
+                          << input_name << ", shape: " << Shape2String(shape);
+    return false;
+  }
 
+  for (const auto dim : shape) {
     // For some undocumented reason, Apple CoreML framework will fail loading the model if the model
     // input has dimension > 16384
     // See this issue, https://github.com/apple/coremltools/issues/1003
-    if (dim.dim_value() > 16384) {
+    if (dim > 16384) {
       LOGS(logger, WARNING) << "CoreML does not support input dim > 16384, input:" << input_name
-                            << ", actual dim: " << dim.dim_value();
+                            << ", actual dim: " << dim;
       return false;
     }
   }
@@ -81,6 +74,7 @@ bool IsInputSupported(const NodeArg& input, const std::string& parent_name, cons
 }
 
 std::unordered_set<const Node*> GetSupportedNodes(const GraphViewer& graph_viewer,
+                                                  const OpBuilderInputParams& input_params,
                                                   const logging::Logger& logger) {
   std::unordered_set<const Node*> supported_nodes{};
 
@@ -92,7 +86,7 @@ std::unordered_set<const Node*> GetSupportedNodes(const GraphViewer& graph_viewe
 #endif
 
   for (const auto& node : graph_viewer.Nodes()) {
-    const bool supported = IsNodeSupported(node, graph_viewer, logger);
+    const bool supported = IsNodeSupported(node, input_params, logger);
     LOGS(logger, VERBOSE) << "Operator type: [" << node.OpType()
                           << "] index: [" << node.Index()
                           << "] name: [" << node.Name()
diff --git a/onnxruntime/core/providers/coreml/builders/helper.h b/onnxruntime/core/providers/coreml/builders/helper.h
index 724117a715..194d022004 100644
--- a/onnxruntime/core/providers/coreml/builders/helper.h
+++ b/onnxruntime/core/providers/coreml/builders/helper.h
@@ -3,9 +3,10 @@
 
 #pragma once
 
-#include "core/common/status.h"
 #include "core/graph/basic_types.h"
 
+#include "core/providers/coreml/builders/op_builder.h"
+
 namespace onnxruntime {
 
 class GraphViewer;
@@ -18,14 +19,16 @@ class Logger;
 
 namespace coreml {
 
-bool GetShape(const NodeArg& node_arg, std::vector<int64_t>& shape, const logging::Logger& logger);
+OpBuilderInputParams MakeOpBuilderParams(const GraphViewer& graph_viewer, uint32_t coreml_flags);
 
-bool IsInputSupported(const NodeArg& node_arg, const std::string& parent_name, const logging::Logger& logger);
+bool IsInputSupported(const NodeArg& node_arg, const std::string& parent_name,
+                      const OpBuilderInputParams& input_params, const logging::Logger& logger);
 
-bool IsNodeSupported(const Node& node, const GraphViewer& graph_viewer, const logging::Logger& logger);
+bool IsNodeSupported(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger);
 
 // Gets the set of nodes that are supported by the CoreML EP.
 std::unordered_set<const Node*> GetSupportedNodes(const GraphViewer& graph_viewer,
+                                                  const OpBuilderInputParams& input_params,
                                                   const logging::Logger& logger);
 
 // CoreML is more efficient running using Apple Neural Engine
diff --git a/onnxruntime/core/providers/coreml/builders/impl/LRN_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/LRN_op_builder.cc
index 3ace8e1fc3..53f18b2058 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/LRN_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/LRN_op_builder.cc
@@ -1,14 +1,15 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/providers/shared/utils/utils.h"
 #include "core/providers/coreml/builders/helper.h"
+#include "core/providers/coreml/builders/impl/base_op_builder.h"
+#include "core/providers/coreml/builders/op_builder_factory.h"
+#include "core/providers/coreml/shape_utils.h"
+#include "core/providers/shared/utils/utils.h"
+
 #ifdef __APPLE__
 #include "core/providers/coreml/builders/model_builder.h"
 #endif
-#include "core/providers/coreml/builders/op_builder_factory.h"
-
-#include "base_op_builder.h"
 
 namespace onnxruntime {
 namespace coreml {
@@ -17,8 +18,8 @@ class LRNOpBuilder : public BaseOpBuilder {
   // Add operator related
 #ifdef __APPLE__
  private:
-  [[nodiscard]] Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
-                                             const logging::Logger& logger) const override;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
+                               const logging::Logger& logger) const override;
 #endif
 
   // Operator support related
diff --git a/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc
index 8590339257..88d6616b4e 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc
@@ -1,18 +1,20 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#include "core/common/narrow.h"
+#include "core/optimizer/initializer.h"
+#include "core/providers/common.h"
+#include "core/providers/coreml/builders/helper.h"
+#include "core/providers/coreml/builders/impl/base_op_builder.h"
+#include "core/providers/coreml/builders/op_builder_factory.h"
+#include "core/providers/coreml/shape_utils.h"
+#include "core/providers/shared/utils/utils.h"
+
 #ifdef __APPLE__
 #include "core/framework/tensorprotoutils.h"
 #include "core/providers/coreml/builders/impl/builder_utils.h"
 #include "core/providers/coreml/builders/model_builder.h"
 #endif
-#include "core/common/narrow.h"
-#include "core/providers/common.h"
-#include "core/providers/coreml/builders/helper.h"
-#include "core/providers/coreml/builders/impl/base_op_builder.h"
-#include "core/providers/coreml/builders/op_builder_factory.h"
-#include "core/providers/shared/utils/utils.h"
-#include "core/optimizer/initializer.h"
 
 namespace onnxruntime {
 namespace coreml {
@@ -24,8 +26,8 @@ class ActivationOpBuilder : public BaseOpBuilder {
   void AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const override;
 
  private:
-  [[nodiscard]] Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
-                                             const logging::Logger& logger) const override;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
+                               const logging::Logger& logger) const override;
 #endif
 
   // Operator support related
@@ -135,6 +137,12 @@ bool IsPReluOpSupported(const Node& node, const OpBuilderInputParams& input_para
     return false;
   }
 
+  // ensure that the third from last dimension is not dynamic
+  if (x_shape[x_rank - 3] == -1) {
+    LOGS(logger, VERBOSE) << "PRelu 'X' input must have a known third from last dimension.";
+    return false;
+  }
+
   // slope input must be a constant initializer
   if (!input_params.graph_viewer.IsConstantInitializer(input_defs[1]->Name(), true)) {
     LOGS(logger, VERBOSE) << "PRelu 'slope' input must be a constant initializer tensor";
@@ -146,7 +154,7 @@ bool IsPReluOpSupported(const Node& node, const OpBuilderInputParams& input_para
   // - have 1 element
   {
     std::vector<int64_t> slope_shape;
-    if (!GetShape(*input_defs[1], slope_shape, logger)) {
+    if (!GetStaticShape(*input_defs[1], slope_shape, logger)) {
       return false;
     }
     const bool has_per_channel_slopes =
diff --git a/onnxruntime/core/providers/coreml/builders/impl/argmax_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/argmax_op_builder.cc
index 6eb1bf1da0..7a5d4a5af6 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/argmax_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/argmax_op_builder.cc
@@ -16,8 +16,8 @@ class ArgMaxOpBuilder : public BaseOpBuilder {
   // Add operator related
  private:
 #ifdef __APPLE__
-  [[nodiscard]] Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
-                                             const logging::Logger& logger) const override;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
+                               const logging::Logger& logger) const override;
 #endif
 
   // Operator support related
diff --git a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc
index cc883a70dc..25d5bad14c 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc
@@ -1,15 +1,15 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <core/providers/common.h>
+#include "core/providers/coreml/builders/impl/base_op_builder.h"
+
+#include "core/providers/common.h"
+#include "core/providers/coreml/builders/helper.h"
+#include "core/providers/shared/utils/utils.h"
 
 #ifdef __APPLE__
 #include "core/providers/coreml/builders/model_builder.h"
 #endif
-#include "core/providers/coreml/builders/helper.h"
-#include "core/providers/shared/utils/utils.h"
-
-#include "base_op_builder.h"
 
 namespace onnxruntime {
 namespace coreml {
@@ -41,8 +41,8 @@ bool HasExternalInitializer(const InitializedTensorSet& initializers, const Node
 // Add operator related
 #ifdef __APPLE__
 Status BaseOpBuilder::AddToModelBuilder(ModelBuilder& model_builder, const Node& node,
+                                        const OpBuilderInputParams& input_params,
                                         const logging::Logger& logger) const {
-  OpBuilderInputParams input_params(model_builder.GetGraphViewer());
   ORT_RETURN_IF_NOT(
       IsOpSupported(node, input_params, logger),
       "Unsupported operator ",
@@ -77,7 +77,7 @@ BaseOpBuilder::CreateNNLayer(const std::string& layer_name) {
 
 bool BaseOpBuilder::IsOpSupported(const Node& node, const OpBuilderInputParams& input_params,
                                   const logging::Logger& logger) const {
-  if (!HasSupportedInputs(node, logger))
+  if (!HasSupportedInputs(node, input_params, logger))
     return false;
 
   // We do not support external initializers for now
@@ -91,10 +91,11 @@ bool BaseOpBuilder::IsOpSupported(const Node& node, const OpBuilderInputParams&
   return IsOpSupportedImpl(node, input_params, logger);
 }
 
-bool BaseOpBuilder::HasSupportedInputs(const Node& node, const logging::Logger& logger) const {
+bool BaseOpBuilder::HasSupportedInputs(const Node& node, const OpBuilderInputParams& input_params,
+                                       const logging::Logger& logger) const {
   const auto node_name = MakeString("Node [", node.Name(), "] type [", node.OpType(), "]");
   for (const auto* input : node.InputDefs()) {
-    if (!IsInputSupported(*input, node_name, logger)) {
+    if (!IsInputSupported(*input, node_name, input_params, logger)) {
       return false;
     }
   }
@@ -111,13 +112,6 @@ bool BaseOpBuilder::HasSupportedInputsImpl(const Node& node, const logging::Logg
   if (!GetType(input, input_type, logger))
     return false;
 
-  if (node.OpType() == "Cast" && input_type == ONNX_NAMESPACE::TensorProto_DataType_INT64) {
-    LOGS(logger, VERBOSE) << "[" << node.OpType()
-                          << "] Input type: [" << input_type
-                          << "] is not actually supported (used for supporting argmax op).";
-    return true;
-  }
-
   if (input_type != ONNX_NAMESPACE::TensorProto_DataType_FLOAT) {
     LOGS(logger, VERBOSE) << "[" << node.OpType()
                           << "] Input type: [" << input_type
diff --git a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h
index c3f8d5b983..b142db86a7 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h
+++ b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h
@@ -5,6 +5,10 @@
 
 #include "core/providers/coreml/builders/op_builder.h"
 
+#ifdef __APPLE__
+#include "core/providers/coreml/builders/coreml_spec.h"
+#endif
+
 namespace onnxruntime {
 namespace coreml {
 
@@ -19,12 +23,13 @@ class BaseOpBuilder : public IOpBuilder {
 #ifdef __APPLE__
  public:
   virtual void AddInitializersToSkip(ModelBuilder& /* model_builder */, const Node& /* node */) const override {}
-  [[nodiscard]] Status AddToModelBuilder(ModelBuilder& model_builder, const Node& node,
-                                         const logging::Logger& logger) const override final;
+  Status AddToModelBuilder(ModelBuilder& model_builder, const Node& node,
+                           const OpBuilderInputParams& input_params,
+                           const logging::Logger& logger) const override final;
 
  protected:
-  [[nodiscard]] virtual Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
-                                                     const logging::Logger& logger) const = 0;
+  virtual Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
+                                       const logging::Logger& logger) const = 0;
 
   static std::unique_ptr<COREML_SPEC::NeuralNetworkLayer>
   CreateNNLayer(ModelBuilder& model_builder, const Node& node);
@@ -35,7 +40,7 @@ class BaseOpBuilder : public IOpBuilder {
   // Operator support related
  public:
   bool IsOpSupported(const Node& node, const OpBuilderInputParams& input_params,
-                     const logging::Logger& logger) const override;
+                     const logging::Logger& logger) const override final;
 
  protected:
   virtual bool IsOpSupportedImpl(const Node& /* node */, const OpBuilderInputParams& /* input_params */,
@@ -50,7 +55,8 @@ class BaseOpBuilder : public IOpBuilder {
 
  private:
   bool HasSupportedOpSet(const Node& node, const logging::Logger& logger) const;
-  bool HasSupportedInputs(const Node& node, const logging::Logger& logger) const;
+  bool HasSupportedInputs(const Node& node, const OpBuilderInputParams& input_params,
+                          const logging::Logger& logger) const;
 };
 
 }  // namespace coreml
diff --git a/onnxruntime/core/providers/coreml/builders/impl/batch_norm_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/batch_norm_op_builder.cc
index 8a7dcc217b..391b02eaec 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/batch_norm_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/batch_norm_op_builder.cc
@@ -2,15 +2,16 @@
 // Licensed under the MIT License.
 
 #include "core/providers/common.h"
-#include "core/providers/shared/utils/utils.h"
 #include "core/providers/coreml/builders/helper.h"
+#include "core/providers/coreml/builders/impl/base_op_builder.h"
+#include "core/providers/coreml/builders/impl/builder_utils.h"
+#include "core/providers/coreml/builders/op_builder_factory.h"
+#include "core/providers/coreml/shape_utils.h"
+#include "core/providers/shared/utils/utils.h"
+
 #ifdef __APPLE__
 #include "core/providers/coreml/builders/model_builder.h"
 #endif
-#include "core/providers/coreml/builders/op_builder_factory.h"
-
-#include "base_op_builder.h"
-#include "builder_utils.h"
 
 namespace onnxruntime {
 namespace coreml {
@@ -22,8 +23,8 @@ class BatchNormalizationOpBuilder : public BaseOpBuilder {
   void AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const override;
 
  private:
-  [[nodiscard]] Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
-                                             const logging::Logger& logger) const override;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
+                               const logging::Logger& logger) const override;
 #endif
 
   // Operator support related
diff --git a/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc
index 3ec82d4a9b..10c9b32d03 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc
@@ -6,6 +6,7 @@
 #include "core/providers/coreml/builders/op_builder_factory.h"
 #include "core/providers/shared/utils/utils.h"
 #ifdef __APPLE__
+#include "core/framework/tensorprotoutils.h"
 #include "core/providers/coreml/builders/model_builder.h"
 #endif
 
@@ -18,8 +19,8 @@ class BinaryOpBuilder : public BaseOpBuilder {
   // Add operator related
  private:
 #ifdef __APPLE__
-  [[nodiscard]] Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
-                                             const logging::Logger& logger) const override;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
+                               const logging::Logger& logger) const override;
 #endif
   // Operator support related
   int GetMinSupportedOpSet(const Node& node) const override;
@@ -30,15 +31,38 @@ class BinaryOpBuilder : public BaseOpBuilder {
 #ifdef __APPLE__
+// Returns true only when both inputs of `node` have shape info and the two shapes can be shown to be identical:
+// same rank, and each pair of dimensions has either the same fixed value or the same named symbolic dimension.
 static bool CheckIfBothInputShapesMatch(const Node& node, const logging::Logger& logger) {
   const auto& input_defs = node.InputDefs();
-  std::vector<int64_t> input_shape1;
-  if (!GetShape(*input_defs[0], input_shape1, logger))
-    return false;
 
-  std::vector<int64_t> input_shape2;
-  if (!GetShape(*input_defs[1], input_shape2, logger))
-    return false;
+  const auto* x_shape_proto = input_defs[0]->Shape();
+  const auto* y_shape_proto = input_defs[1]->Shape();
 
-  return input_shape1 == input_shape2;
+  if (!x_shape_proto || !y_shape_proto) {
+    LOGS(logger, WARNING) << "[" << node.Name() << "] Input shape is missing";
+    return false;
+  }
+
+  using Dimension = ONNX_NAMESPACE::TensorShapeProto::Dimension;
+  auto dim_eq =
+      [](const Dimension& x_dim, const Dimension& y_dim) {
+        const bool x_has_dim_value = utils::HasDimValue(x_dim);
+        if (x_has_dim_value != utils::HasDimValue(y_dim)) {
+          return false;
+        }
+        if (x_has_dim_value) {
+          return x_dim.dim_value() == y_dim.dim_value();
+        }
+        // Neither dimension has a fixed value. dim_param() yields an empty string when no symbolic name is
+        // set, so two completely unknown dimensions would otherwise compare equal. Only treat the
+        // dimensions as matching if they share the same non-empty symbolic name.
+        const auto& x_dim_param = x_dim.dim_param();
+        return !x_dim_param.empty() && x_dim_param == y_dim.dim_param();
+      };
+
+  // The two-range overload of std::equal also returns false when the ranks differ.
+  return std::equal(x_shape_proto->dim().begin(), x_shape_proto->dim().end(),
+                    y_shape_proto->dim().begin(), y_shape_proto->dim().end(),
+                    dim_eq);
 }
 
 // Add operator related
diff --git a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc
index d14c6ff041..e68104f104 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc
@@ -16,14 +16,14 @@
 namespace onnxruntime {
 namespace coreml {
 
-common::Status ComputeConvPads(const std::vector<int64_t> input_shape,
-                               const int64_t weight_size_y,
-                               const int64_t weight_size_x,
-                               const std::vector<int64_t>& onnx_pads,
-                               const std::vector<int64_t>& onnx_strides,
-                               const std::vector<int64_t>& onnx_dilations,
-                               AutoPadType auto_pad_type,
-                               std::vector<int64_t>& pads_out) {
+Status ComputeConvPads(const std::vector<int64_t> input_shape,
+                       const int64_t weight_size_y,
+                       const int64_t weight_size_x,
+                       const std::vector<int64_t>& onnx_pads,
+                       const std::vector<int64_t>& onnx_strides,
+                       const std::vector<int64_t>& onnx_dilations,
+                       AutoPadType auto_pad_type,
+                       std::vector<int64_t>& pads_out) {
   const int64_t input_size_y = input_shape[2];
   const int64_t input_size_x = input_shape[3];
   const int64_t stride_y = onnx_strides[0];
@@ -50,16 +50,18 @@ common::Status ComputeConvPads(const std::vector<int64_t> input_shape,
   return Status::OK();
 }
 
-common::Status HandleAutoPad(const std::vector<int64_t> input_shape,
-                             const int64_t weight_size_y,
-                             const int64_t weight_size_x,
-                             const std::vector<int64_t>& onnx_pads,
-                             const std::vector<int64_t>& onnx_strides,
-                             const std::vector<int64_t>& onnx_dilations,
-                             AutoPadType auto_pad_type,
-                             AutoPadType& auto_pad_type_out) {
+Status HandleAutoPad(const std::vector<int64_t> input_shape,
+                     const int64_t weight_size_y,
+                     const int64_t weight_size_x,
+                     const std::vector<int64_t>& onnx_pads,
+                     const std::vector<int64_t>& onnx_strides,
+                     const std::vector<int64_t>& onnx_dilations,
+                     AutoPadType auto_pad_type,
+                     AutoPadType& auto_pad_type_out) {
   auto_pad_type_out = auto_pad_type;
-  if (auto_pad_type == AutoPadType::NOTSET && onnx_dilations == std::vector<int64_t>{1, 1}) {
+  if (auto_pad_type == AutoPadType::NOTSET && onnx_dilations == std::vector<int64_t>{1, 1} &&
+      // ComputeConvPads() only handles known dimensions of input_shape[2] and input_shape[3]
+      input_shape[2] != -1 && input_shape[3] != -1) {
     {
       std::vector<int64_t> same_upper_pads;
       ORT_RETURN_IF_ERROR(ComputeConvPads(input_shape, weight_size_y, weight_size_x,
@@ -91,8 +93,8 @@ void CreateCoreMLWeight(CoreML::Specification::WeightParams& weight,
   *weight.mutable_floatvalue() = {data, data + num_elements};
 }
 
-common::Status CreateCoreMLWeight(CoreML::Specification::WeightParams& weight,
-                                  const ONNX_NAMESPACE::TensorProto& tensor) {
+Status CreateCoreMLWeight(CoreML::Specification::WeightParams& weight,
+                          const ONNX_NAMESPACE::TensorProto& tensor) {
   auto data_type = tensor.data_type();
   if (data_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT) {
     Initializer unpacked_tensor(tensor);
@@ -105,7 +107,7 @@ common::Status CreateCoreMLWeight(CoreML::Specification::WeightParams& weight,
                            tensor.name(), " type: ", data_type);
   }
 
-  return common::Status::OK();
+  return Status::OK();
 }
 
 }  // namespace coreml
diff --git a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h
index a2caa7415b..2cfe5f00e5 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h
+++ b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h
@@ -23,18 +23,18 @@ namespace coreml {
 
 // Try to see if we can map explicit padding to auto padding for Conv/Pool
 // Since usually use auto padding is more efficient
-[[nodiscard]] common::Status HandleAutoPad(const std::vector<int64_t> input_shape,
-                                           const int64_t weight_size_y,
-                                           const int64_t weight_size_x,
-                                           const std::vector<int64_t>& onnx_pads,
-                                           const std::vector<int64_t>& onnx_strides,
-                                           const std::vector<int64_t>& onnx_dilations,
-                                           AutoPadType auto_pad_type,
-                                           AutoPadType& auto_pad_type_out);
+Status HandleAutoPad(const std::vector<int64_t> input_shape,
+                     const int64_t weight_size_y,
+                     const int64_t weight_size_x,
+                     const std::vector<int64_t>& onnx_pads,
+                     const std::vector<int64_t>& onnx_strides,
+                     const std::vector<int64_t>& onnx_dilations,
+                     AutoPadType auto_pad_type,
+                     AutoPadType& auto_pad_type_out);
 
 // Copy an onnx initializer data to a coreml weight
-common::Status CreateCoreMLWeight(CoreML::Specification::WeightParams& weight,
-                                  const ONNX_NAMESPACE::TensorProto& tensor);
+Status CreateCoreMLWeight(CoreML::Specification::WeightParams& weight,
+                          const ONNX_NAMESPACE::TensorProto& tensor);
 
 // Copy the float array to a coreml weight
 void CreateCoreMLWeight(CoreML::Specification::WeightParams& weight,
diff --git a/onnxruntime/core/providers/coreml/builders/impl/cast_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/cast_op_builder.cc
index 4363b80ece..15ee1f0fc7 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/cast_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/cast_op_builder.cc
@@ -17,12 +17,13 @@ class CastOpBuilder : public BaseOpBuilder {
   // Add operator related
  private:
 #ifdef __APPLE__
-  [[nodiscard]] Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
-                                             const logging::Logger& logger) const override;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
+                               const logging::Logger& logger) const override;
 #endif
   // Operator support related
   bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params,
                          const logging::Logger& logger) const override;
+  bool HasSupportedInputsImpl(const Node& node, const logging::Logger& logger) const override;
 };
 
 // Add operator related
@@ -63,7 +64,7 @@ bool CastOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPara
                           << "]";
     return false;
   }
-  if (!IsNodeSupported(prec_node, input_params.graph_viewer, logger)) {
+  if (!IsNodeSupported(prec_node, input_params, logger)) {
     LOGS(logger, VERBOSE) << "Cast's producing node ["
                           << prec_node.OpType()
                           << "] is not a supported op.";
@@ -83,6 +84,25 @@ bool CastOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPara
   return true;
 }
 
+// Input type check for Cast: returns true only if the type of input 0 can be read and is INT64, false otherwise.
+bool CastOpBuilder::HasSupportedInputsImpl(const Node& node, const logging::Logger& logger) const {
+  // We only check the type of input 0 (InputDefs() is indexed without a bounds check here)
+  const auto& input = *node.InputDefs()[0];
+
+  int32_t input_type;  // presumably written by GetType() when it returns true; it is only read after that
+  if (!GetType(input, input_type, logger))
+    return false;
+
+  // only support int64 coming from ArgMax (check for ArgMax is done in IsOpSupportedImpl())
+  if (input_type != ONNX_NAMESPACE::TensorProto_DataType_INT64) {
+    LOGS(logger, VERBOSE) << "[" << node.OpType()
+                          << "] Input type: [" << input_type
+                          << "] is not supported.";
+    return false;
+  }
+  return true;
+}
+
 void CreateCastOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) {
   op_registrations.builders.push_back(std::make_unique<CastOpBuilder>());
   op_registrations.op_builder_map.emplace(op_type, op_registrations.builders.back().get());
diff --git a/onnxruntime/core/providers/coreml/builders/impl/clip_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/clip_op_builder.cc
index 07a8a236a9..3a3f89d24c 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/clip_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/clip_op_builder.cc
@@ -19,8 +19,8 @@ class ClipOpBuilder : public BaseOpBuilder {
   void AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const override;
 
  private:
-  [[nodiscard]] Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
-                                             const logging::Logger& logger) const override;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
+                               const logging::Logger& logger) const override;
 #endif
 
   // Operator support related
diff --git a/onnxruntime/core/providers/coreml/builders/impl/concat_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/concat_op_builder.cc
index cc298a4f20..b1e761024f 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/concat_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/concat_op_builder.cc
@@ -2,15 +2,16 @@
 // Licensed under the MIT License.
 
 #include "core/providers/common.h"
-#include "core/providers/shared/utils/utils.h"
 #include "core/providers/coreml/builders/helper.h"
+#include "core/providers/coreml/builders/impl/base_op_builder.h"
 #include "core/providers/coreml/builders/op_builder_factory.h"
+#include "core/providers/coreml/shape_utils.h"
+#include "core/providers/shared/utils/utils.h"
+
 #ifdef __APPLE__
 #include "core/providers/coreml/builders/model_builder.h"
 #endif
 
-#include "base_op_builder.h"
-
 namespace onnxruntime {
 namespace coreml {
 
@@ -18,8 +19,8 @@ class ConcatOpBuilder : public BaseOpBuilder {
   // Add operator related
  private:
 #ifdef __APPLE__
-  [[nodiscard]] Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
-                                             const logging::Logger& logger) const override;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
+                               const logging::Logger& logger) const override;
 #endif
 
   // Operator support related
diff --git a/onnxruntime/core/providers/coreml/builders/impl/conv_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/conv_op_builder.cc
index 757f69f4d9..ff9dcbd9f8 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/conv_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/conv_op_builder.cc
@@ -2,15 +2,16 @@
 // Licensed under the MIT License.
 
 #include "core/providers/common.h"
-#include "core/providers/shared/utils/utils.h"
 #include "core/providers/coreml/builders/helper.h"
+#include "core/providers/coreml/builders/impl/base_op_builder.h"
 #include "core/providers/coreml/builders/op_builder_factory.h"
-#ifdef __APPLE__
-#include "core/providers/coreml/builders/model_builder.h"
-#include "builder_utils.h"
-#endif
+#include "core/providers/shared/utils/utils.h"
 
-#include "base_op_builder.h"
+#ifdef __APPLE__
+#include "core/providers/coreml/builders/impl/builder_utils.h"
+#include "core/providers/coreml/builders/model_builder.h"
+#include "core/providers/coreml/shape_utils.h"
+#endif
 
 namespace onnxruntime {
 namespace coreml {
@@ -22,8 +23,8 @@ class ConvOpBuilder : public BaseOpBuilder {
   void AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const override;
 
  private:
-  [[nodiscard]] Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
-                                             const logging::Logger& logger) const override;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
+                               const logging::Logger& logger) const override;
 #endif
 
   // Operator support related
diff --git a/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc
index 6939107d23..a4ad1c31b5 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc
@@ -1,16 +1,16 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <core/common/safeint.h>
-
-#include "core/providers/shared/utils/utils.h"
+#include "core/common/safeint.h"
 #include "core/providers/coreml/builders/helper.h"
+#include "core/providers/coreml/builders/impl/base_op_builder.h"
+#include "core/providers/coreml/builders/op_builder_factory.h"
+#include "core/providers/coreml/shape_utils.h"
+#include "core/providers/shared/utils/utils.h"
+
 #ifdef __APPLE__
 #include "core/providers/coreml/builders/model_builder.h"
 #endif
-#include "core/providers/coreml/builders/op_builder_factory.h"
-
-#include "base_op_builder.h"
 
 namespace onnxruntime {
 namespace coreml {
@@ -19,8 +19,8 @@ class DepthToSpaceOpBuilder : public BaseOpBuilder {
   // Add operator related
  private:
 #ifdef __APPLE__
-  [[nodiscard]] Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
-                                             const logging::Logger& logger) const override;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
+                               const logging::Logger& logger) const override;
 #endif
 
   // Operator support related
diff --git a/onnxruntime/core/providers/coreml/builders/impl/flatten_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/flatten_op_builder.cc
index 4b6da84ff5..b303fe7884 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/flatten_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/flatten_op_builder.cc
@@ -1,15 +1,15 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/providers/shared/utils/utils.h"
 #include "core/providers/coreml/builders/helper.h"
+#include "core/providers/coreml/builders/impl/base_op_builder.h"
+#include "core/providers/coreml/builders/op_builder_factory.h"
+#include "core/providers/coreml/shape_utils.h"
+#include "core/providers/shared/utils/utils.h"
 
 #ifdef __APPLE__
 #include "core/providers/coreml/builders/model_builder.h"
 #endif
-#include "core/providers/coreml/builders/op_builder_factory.h"
-
-#include "base_op_builder.h"
 
 namespace onnxruntime {
 namespace coreml {
@@ -18,8 +18,8 @@ class FlattenOpBuilder : public BaseOpBuilder {
   // Add operator related
 #ifdef __APPLE__
  private:
-  [[nodiscard]] Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
-                                             const logging::Logger& logger) const override;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
+                               const logging::Logger& logger) const override;
 #endif
 
   // Operator support related
diff --git a/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc
index 4e3944d824..405ca9542b 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc
@@ -1,21 +1,21 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <core/common/safeint.h>
-#include <core/framework/tensorprotoutils.h>
-#include "core/providers/common.h"
-#include "core/providers/shared/utils/utils.h"
-#include "core/providers/coreml/builders/helper.h"
-#include "core/providers/coreml/builders/op_builder_factory.h"
+#include "core/common/safeint.h"
+#include "core/framework/tensorprotoutils.h"
 #include "core/optimizer/initializer.h"
+#include "core/providers/common.h"
+#include "core/providers/coreml/builders/helper.h"
+#include "core/providers/coreml/builders/impl/base_op_builder.h"
+#include "core/providers/coreml/builders/op_builder_factory.h"
+#include "core/providers/coreml/shape_utils.h"
+#include "core/providers/shared/utils/utils.h"
 
 #ifdef __APPLE__
+#include "core/providers/coreml/builders/impl/builder_utils.h"
 #include "core/providers/coreml/builders/model_builder.h"
-#include "builder_utils.h"
 #endif
 
-#include "base_op_builder.h"
-
 namespace onnxruntime {
 namespace coreml {
 
@@ -26,8 +26,8 @@ class GemmOpBuilder : public BaseOpBuilder {
   void AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const override;
 
  private:
-  [[nodiscard]] Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
-                                             const logging::Logger& logger) const override;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
+                               const logging::Logger& logger) const override;
 #endif
 
   // Operator support related
@@ -146,6 +146,7 @@ bool GemmOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPara
       return false;
     }
 
+    // TODO: is it OK if the shape is dynamic and turns out to be empty (zero elements) at runtime?
     if (Product(a_shape) == 0) {
       LOGS(logger, VERBOSE) << "A must be non-empty";
       return false;
diff --git a/onnxruntime/core/providers/coreml/builders/impl/pad_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/pad_op_builder.cc
index 603ed6e20f..ba12600e8b 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/pad_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/pad_op_builder.cc
@@ -1,19 +1,19 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/providers/common.h"
-#include "core/framework/tensorprotoutils.h"
 #include "core/framework/tensor_shape.h"
-#include "core/providers/coreml/builders/helper.h"
-#include "core/providers/shared/utils/utils.h"
+#include "core/framework/tensorprotoutils.h"
 #include "core/optimizer/initializer.h"
+#include "core/providers/common.h"
+#include "core/providers/coreml/builders/helper.h"
+#include "core/providers/coreml/builders/impl/base_op_builder.h"
+#include "core/providers/coreml/builders/op_builder_factory.h"
+#include "core/providers/coreml/shape_utils.h"
+#include "core/providers/shared/utils/utils.h"
 
 #ifdef __APPLE__
 #include "core/providers/coreml/builders/model_builder.h"
 #endif
-#include "core/providers/coreml/builders/op_builder_factory.h"
-
-#include "base_op_builder.h"
 
 namespace onnxruntime {
 namespace coreml {
@@ -141,6 +141,7 @@ bool PadOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParam
     return false;
   }
 
+  // TODO: is it OK if the shape is dynamic and turns out to be empty (zero elements) at runtime?
   const TensorShape shape(input_shape);
   if (shape.Size() == 0) {
     LOGS(logger, VERBOSE) << "Cases that input data being empty due to a dimension with value of 0 is not supported";
diff --git a/onnxruntime/core/providers/coreml/builders/impl/pool_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/pool_op_builder.cc
index ce72a83cc8..fd1c77c851 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/pool_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/pool_op_builder.cc
@@ -2,15 +2,16 @@
 // Licensed under the MIT License.
 
 #include "core/providers/common.h"
-#include "core/providers/shared/utils/utils.h"
 #include "core/providers/coreml/builders/helper.h"
+#include "core/providers/coreml/builders/impl/base_op_builder.h"
 #include "core/providers/coreml/builders/op_builder_factory.h"
-#ifdef __APPLE__
-#include "core/providers/coreml/builders/model_builder.h"
-#include "builder_utils.h"
-#endif
+#include "core/providers/coreml/shape_utils.h"
+#include "core/providers/shared/utils/utils.h"
 
-#include "base_op_builder.h"
+#ifdef __APPLE__
+#include "core/providers/coreml/builders/impl/builder_utils.h"
+#include "core/providers/coreml/builders/model_builder.h"
+#endif
 
 namespace onnxruntime {
 namespace coreml {
@@ -19,8 +20,8 @@ class PoolOpBuilder : public BaseOpBuilder {
   // Add operator related
  private:
 #ifdef __APPLE__
-  [[nodiscard]] Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
-                                             const logging::Logger& logger) const override;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
+                               const logging::Logger& logger) const override;
 #endif
 
   // Operator support related
diff --git a/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc
index 4d5102bbb2..9df0dde024 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc
@@ -1,19 +1,19 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/providers/common.h"
 #include "core/framework/tensorprotoutils.h"
-#include "core/providers/cpu/tensor/reshape_helper.h"
 #include "core/optimizer/initializer.h"
-
-#include "core/providers/shared/utils/utils.h"
+#include "core/providers/common.h"
 #include "core/providers/coreml/builders/helper.h"
+#include "core/providers/coreml/builders/impl/base_op_builder.h"
+#include "core/providers/coreml/builders/op_builder_factory.h"
+#include "core/providers/coreml/shape_utils.h"
+#include "core/providers/cpu/tensor/reshape_helper.h"
+#include "core/providers/shared/utils/utils.h"
+
 #ifdef __APPLE__
 #include "core/providers/coreml/builders/model_builder.h"
 #endif
-#include "core/providers/coreml/builders/op_builder_factory.h"
-
-#include "base_op_builder.h"
 
 namespace onnxruntime {
 namespace coreml {
@@ -25,8 +25,8 @@ class ReshapeOpBuilder : public BaseOpBuilder {
   void AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const override;
 
  private:
-  [[nodiscard]] Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
-                                             const logging::Logger& logger) const override;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
+                               const logging::Logger& logger) const override;
 #endif
 
   // Operator support related
@@ -60,7 +60,7 @@ Status ReshapeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
   const auto size = target_shape_tensor.dims()[0];
   TensorShapeVector target_shape{raw_target_shape, raw_target_shape + size};
   std::vector<int64_t> input_shape;
-  ORT_RETURN_IF_NOT(GetShape(*input_defs[0], input_shape, logger), "Cannot get shape");
+  ORT_RETURN_IF_NOT(GetStaticShape(*input_defs[0], input_shape, logger), "Cannot get shape");
   ReshapeHelper helper(TensorShape(input_shape), target_shape);
   *layer->mutable_reshapestatic()->mutable_targetshape() = {target_shape.cbegin(), target_shape.cend()};
   *layer->mutable_input()->Add() = input_defs[0]->Name();
@@ -93,7 +93,7 @@ bool ReshapeOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputP
   }
 
   std::vector<int64_t> input_shape;
-  if (!GetShape(*input_defs[0], input_shape, logger))
+  if (!GetStaticShape(*input_defs[0], input_shape, logger))
     return false;
 
   if (input_shape.empty()) {
diff --git a/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc
index ace70dd383..5f963dc30d 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc
@@ -3,19 +3,19 @@
 
 #include <math.h>
 
-#include "core/providers/common.h"
 #include "core/framework/tensorprotoutils.h"
+#include "core/optimizer/initializer.h"
+#include "core/providers/common.h"
 #include "core/providers/coreml/builders/helper.h"
+#include "core/providers/coreml/builders/impl/base_op_builder.h"
+#include "core/providers/coreml/builders/op_builder_factory.h"
+#include "core/providers/coreml/shape_utils.h"
 #include "core/providers/cpu/tensor/reshape_helper.h"
 #include "core/providers/shared/utils/utils.h"
-#include "core/optimizer/initializer.h"
 
 #ifdef __APPLE__
 #include "core/providers/coreml/builders/model_builder.h"
 #endif
-#include "core/providers/coreml/builders/op_builder_factory.h"
-
-#include "base_op_builder.h"
 
 namespace onnxruntime {
 namespace coreml {
@@ -27,8 +27,8 @@ class ResizeOpBuilder : public BaseOpBuilder {
   void AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const override;
 
  private:
-  [[nodiscard]] Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
-                                             const logging::Logger& logger) const override;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
+                               const logging::Logger& logger) const override;
 #endif
 
   // Operator support related
@@ -70,7 +70,7 @@ bool GetResizeOutputSizes(const InitializedTensorSet& initializers,
     return false;
   Initializer unpacked_tensor(sizes_tensor);
   auto sizes_data = unpacked_tensor.DataAsSpan<int64_t>();
-  sizes = std::vector<int64_t>{sizes_data.begin(), sizes_data.end()};
+  sizes = std::vector<int64_t>(sizes_data.begin(), sizes_data.end());
   return true;
 }
 
@@ -117,7 +117,7 @@ Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
     coreml_upsample->add_scalingfactor(static_cast<int64_t>(scales[3]));
   } else {  // we already checked number of inputs in IsOpSupportedImpl
     std::vector<int64_t> input_shape;
-    ORT_RETURN_IF_NOT(GetShape(*input_defs[0], input_shape, logger), "Error getting input shape");
+    ORT_RETURN_IF_NOT(GetStaticShape(*input_defs[0], input_shape, logger), "Error getting input shape");
     std::vector<int64_t> output_sizes;
     ORT_RETURN_IF_NOT(GetResizeOutputSizes(initializers, node, output_sizes, logger),
                       "Error getting resize output_sizes");
@@ -245,10 +245,15 @@ bool ResizeOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPa
       if (!GetResizeOutputSizes(initializers, node, output_sizes, logger))
         return false;
 
+      if (!IsStaticShape(input_shape)) {
+        LOGS(logger, VERBOSE) << "Input shape with dynamic dimensions is not supported.";
+        return false;
+      }
+
       auto output_size_n = output_sizes[0];
       auto output_size_c = output_sizes[1];
       if (output_size_n != input_shape[0] || output_size_c != input_shape[1]) {
-        LOGS(logger, VERBOSE) << "Output sizes of N/C chanel should match the input sizes, "
+        LOGS(logger, VERBOSE) << "Output sizes of N/C channel should match the input sizes, "
                               << "Resize of N/C channels are not supported"
                               << ", input_size_n, " << input_shape[0] << ", output_size_n, " << output_size_n
                               << ". input_size_c, " << input_shape[1] << ", output_size_c, " << output_size_c;
diff --git a/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc
index d1f5878c26..2e14c85ce6 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc
@@ -23,8 +23,8 @@ class SqueezeOpBuilder : public BaseOpBuilder {
   void AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const override;
 
  private:
-  [[nodiscard]] Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
-                                             const logging::Logger& logger) const override;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
+                               const logging::Logger& logger) const override;
 #endif
 
   // Operator support related
diff --git a/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc
index 5306d9f11d..7d5018a19f 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc
@@ -1,14 +1,15 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/providers/shared/utils/utils.h"
 #include "core/providers/coreml/builders/helper.h"
+#include "core/providers/coreml/builders/impl/base_op_builder.h"
+#include "core/providers/coreml/builders/op_builder_factory.h"
+#include "core/providers/coreml/shape_utils.h"
+#include "core/providers/shared/utils/utils.h"
+
 #ifdef __APPLE__
 #include "core/providers/coreml/builders/model_builder.h"
 #endif
-#include "core/providers/coreml/builders/op_builder_factory.h"
-
-#include "base_op_builder.h"
 
 namespace onnxruntime {
 namespace coreml {
@@ -17,8 +18,8 @@ class TransposeOpBuilder : public BaseOpBuilder {
   // Add operator related
 #ifdef __APPLE__
  private:
-  [[nodiscard]] Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
-                                             const logging::Logger& logger) const override;
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
+                               const logging::Logger& logger) const override;
 #endif
 };
 
diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.cc b/onnxruntime/core/providers/coreml/builders/model_builder.cc
index 6ca39ec301..98edaaa4b6 100644
--- a/onnxruntime/core/providers/coreml/builders/model_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/model_builder.cc
@@ -9,9 +9,10 @@
 #include "op_builder_factory.h"
 
 #include "core/providers/common.h"
-#include "core/providers/coreml/model/model.h"
-#include "core/providers/coreml/model/host_utils.h"
 #include "core/providers/coreml/builders/impl/builder_utils.h"
+#include "core/providers/coreml/model/host_utils.h"
+#include "core/providers/coreml/model/model.h"
+#include "core/providers/coreml/shape_utils.h"
 
 namespace onnxruntime {
 namespace coreml {
@@ -60,12 +61,7 @@ void ModelBuilder::PreprocessInitializers() {
     // find all initializers consumed. AddInitializersToSkip will potentially decrement the usage count.
     for (const auto* input : node.InputDefs()) {
       if (input->Exists() && Contains(initializers, input->Name())) {
-        auto entry = initializer_usage_.find(input->Name());
-        if (entry == initializer_usage_.end()) {
-          initializer_usage_[input->Name()] = 1;
-        } else {
-          entry->second++;
-        }
+        initializer_usage_[input->Name()]++;
       }
     }
     if (const auto* op_builder = GetOpBuilder(node)) {
@@ -128,33 +124,43 @@ Status ModelBuilder::RegisterModelInputOutput(const NodeArg& node_arg, bool is_i
 
   input_output.set_name(name);
   auto* multi_array = input_output.mutable_type()->mutable_multiarraytype();
+
   std::vector<int64_t> shape;
+  ORT_RETURN_IF_NOT(GetShape(node_arg, shape, logger_),
+                    "Unable to get shape for ", input_output_type, ": ", name);
 
-  {  // input_output shape
-    const auto* shape_proto = node_arg.Shape();
-    ORT_RETURN_IF(shape_proto == nullptr,
-                  "shape_proto cannot be null for ", input_output_type, ": ", name);
-    const auto& dims = shape_proto->dim();
-    if (dims.empty()) {
-      // If we have an empty shape, this is a scalar input,
-      // Since all the input output of CoreML EP is MultiArray, we will make the scalar input output as a {1} MultiArray
-      shape.push_back(1);
+  if (shape.empty()) {
+    // If we have an empty shape, this is a scalar input or output.
+    // Since all inputs and outputs of the CoreML EP are MultiArrays, we represent the scalar as a {1} MultiArray.
+    shape.push_back(1);
 
-      // we need to change the shapes of these scalar outputs back to {} when CoreML EP returns these values to ORT
-      if (!is_input) {
-        AddScalarOutput(name);
-      }
-    } else {
-      shape.reserve(dims.size());
-      for (const auto& dim : dims) {
-        ORT_RETURN_IF_NOT(dim.has_dim_value(),
-                          "Dynamic shape is not supported yet, for ", input_output_type, ": ", name);
-        shape.push_back(dim.dim_value());
-      }
+    // We need to change the shapes of these scalar outputs back to {} when the CoreML EP returns these values to ORT.
+    if (!is_input) {
+      AddScalarOutput(name);
     }
   }
 
-  *multi_array->mutable_shape() = {shape.cbegin(), shape.cend()};
+  if (IsStaticShape(shape)) {
+    *multi_array->mutable_shape() = {shape.cbegin(), shape.cend()};
+  } else {
+    auto& multi_array_shape_range = *multi_array->mutable_shaperange();
+    auto& multi_array_shape = *multi_array->mutable_shape();
+
+    for (const auto dim : shape) {
+      auto& multi_array_dim_size_range = *multi_array_shape_range.mutable_sizeranges()->Add();
+      if (dim == -1) {
+        multi_array_dim_size_range.set_lowerbound(0);
+        multi_array_dim_size_range.set_upperbound(-1);  // unbounded
+
+        multi_array_shape.Add(1);  // pick 1 as an arbitrary default dynamic dimension value
+      } else {
+        multi_array_dim_size_range.set_lowerbound(dim);
+        multi_array_dim_size_range.set_upperbound(dim);
+
+        multi_array_shape.Add(dim);
+      }
+    }
+  }
 
   int32_t data_type;
   {  // type
@@ -204,11 +210,12 @@ Status ModelBuilder::RegisterModelInputs() {
 }
 
 Status ModelBuilder::AddOperations() {
+  const auto builder_params = MakeOpBuilderParams(graph_viewer_, coreml_flags_);
   const auto& node_indices = graph_viewer_.GetNodesInTopologicalOrder();
   for (size_t i = 0; i < node_indices.size(); i++) {
     const auto* node(graph_viewer_.GetNode(node_indices[i]));
     if (const auto* op_builder = GetOpBuilder(*node)) {
-      ORT_RETURN_IF_ERROR(op_builder->AddToModelBuilder(*this, *node, logger_));
+      ORT_RETURN_IF_ERROR(op_builder->AddToModelBuilder(*this, *node, builder_params, logger_));
     } else {
       return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
                              "Node [", node->Name(), "], type [", node->OpType(), "] is not supported");
diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.h b/onnxruntime/core/providers/coreml/builders/model_builder.h
index 3dcbd27c13..af2d5437be 100644
--- a/onnxruntime/core/providers/coreml/builders/model_builder.h
+++ b/onnxruntime/core/providers/coreml/builders/model_builder.h
@@ -3,10 +3,8 @@
 
 #pragma once
 
-#include <core/graph/graph_viewer.h>
-#include "coreml/Model.pb.h"
-
-namespace COREML_SPEC = CoreML::Specification;
+#include "core/graph/graph_viewer.h"
+#include "core/providers/coreml/builders/coreml_spec.h"
 
 namespace onnxruntime {
 namespace coreml {
@@ -20,8 +18,8 @@ class ModelBuilder {
   ModelBuilder(const GraphViewer& graph_viewer, const logging::Logger& logger, uint32_t coreml_flags);
   ~ModelBuilder() = default;
 
-  [[nodiscard]] Status Compile(std::unique_ptr<Model>& model, const std::string& path);
-  [[nodiscard]] Status SaveCoreMLModel(const std::string& path);
+  Status Compile(std::unique_ptr<Model>& model, const std::string& path);
+  Status SaveCoreMLModel(const std::string& path);
 
   // Accessors for members
   const GraphViewer& GetGraphViewer() const { return graph_viewer_; }
@@ -55,18 +53,18 @@ class ModelBuilder {
   std::unordered_set<std::string> unique_names_;
 
   // Convert the onnx model to CoreML::Specification::Model
-  [[nodiscard]] Status Initialize();
+  Status Initialize();
 
   // If a CoreML operation will use initializers directly, we will add the initializers to the skip list
   void PreprocessInitializers();
 
   // Copy and process all the initializers to CoreML model
-  [[nodiscard]] Status RegisterInitializers();
+  Status RegisterInitializers();
 
-  [[nodiscard]] Status AddOperations();
-  [[nodiscard]] Status RegisterModelInputs();
-  [[nodiscard]] Status RegisterModelOutputs();
-  [[nodiscard]] Status RegisterModelInputOutput(const NodeArg& node_arg, bool is_input);
+  Status AddOperations();
+  Status RegisterModelInputs();
+  Status RegisterModelOutputs();
+  Status RegisterModelInputOutput(const NodeArg& node_arg, bool is_input);
 
   // Record the onnx scalar output names
   void AddScalarOutput(const std::string& output_name);
diff --git a/onnxruntime/core/providers/coreml/builders/op_builder.h b/onnxruntime/core/providers/coreml/builders/op_builder.h
index ee6e3f5848..79de6438c9 100644
--- a/onnxruntime/core/providers/coreml/builders/op_builder.h
+++ b/onnxruntime/core/providers/coreml/builders/op_builder.h
@@ -11,10 +11,12 @@ namespace coreml {
 class ModelBuilder;
 
 struct OpBuilderInputParams {
-  OpBuilderInputParams(const GraphViewer& graph_viewer)
-      : graph_viewer(graph_viewer) {}
+  OpBuilderInputParams(const GraphViewer& graph_viewer, bool only_allow_static_input_shapes)
+      : graph_viewer(graph_viewer),
+        only_allow_static_input_shapes(only_allow_static_input_shapes) {}
 
   const GraphViewer& graph_viewer;
+  const bool only_allow_static_input_shapes;
 };
 
 class IOpBuilder {
@@ -29,8 +31,9 @@ class IOpBuilder {
   virtual void AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const = 0;
 
   // Add the operator to CoreML model
-  [[nodiscard]] virtual Status AddToModelBuilder(ModelBuilder& model_builder, const Node& node,
-                                                 const logging::Logger& logger) const = 0;
+  virtual Status AddToModelBuilder(ModelBuilder& model_builder, const Node& node,
+                                   const OpBuilderInputParams& input_params,
+                                   const logging::Logger& logger) const = 0;
 #endif
 
   // Operator support related
diff --git a/onnxruntime/core/providers/coreml/coreml_execution_provider.cc b/onnxruntime/core/providers/coreml/coreml_execution_provider.cc
index 37c578624c..84563a46b2 100644
--- a/onnxruntime/core/providers/coreml/coreml_execution_provider.cc
+++ b/onnxruntime/core/providers/coreml/coreml_execution_provider.cc
@@ -3,16 +3,20 @@
 
 #include "core/providers/coreml/coreml_execution_provider.h"
 
+#include <algorithm>
+
 #include "core/framework/compute_capability.h"
+#include "core/framework/tensorprotoutils.h"
 #include "core/graph/graph_viewer.h"
+#include "core/providers/coreml/builders/helper.h"
 #include "core/providers/partitioning_utils.h"
 #include "core/session/onnxruntime_cxx_api.h"
-#include "core/providers/coreml/builders/helper.h"
 
 #ifdef __APPLE__
 #include "core/providers/coreml/builders/model_builder.h"
 #include "core/providers/coreml/model/host_utils.h"
 #include "core/providers/coreml/model/model.h"
+#include "core/providers/coreml/shape_utils.h"
 #endif
 
 namespace onnxruntime {
@@ -45,7 +49,8 @@ CoreMLExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_vie
     return result;
   }
 
-  const auto supported_nodes = coreml::GetSupportedNodes(graph_viewer, logger);
+  const auto builder_params = coreml::MakeOpBuilderParams(graph_viewer, coreml_flags_);
+  const auto supported_nodes = coreml::GetSupportedNodes(graph_viewer, builder_params, logger);
 
   const auto gen_metadef_name = [&]() {
     HashValue model_hash;
@@ -144,6 +149,17 @@ common::Status CoreMLExecutionProvider::Compile(const std::vector<FusedNodeAndGr
         auto input_tensor = ctx.GetInput(i);
         auto tensor_info = input_tensor.GetTensorTypeAndShapeInfo();
         auto shape = tensor_info.GetShape();
+
+        // Disallow inputs with dynamic shape which actually have zero elements.
+        // CoreML doesn't consistently handle this well (e.g., there may be runtime errors).
+        if (const auto* model_input_info = model->TryGetInputOutputInfo(input_name); model_input_info != nullptr) {
+          const auto& inferred_shape = model_input_info->shape;
+          ORT_RETURN_IF(!coreml::IsStaticShape(inferred_shape) && coreml::DoesShapeSpecifyZeroElements(shape),
+                        "Input (", input_name, ") has a dynamic shape (", coreml::Shape2String(inferred_shape),
+                        ") but the runtime shape (", coreml::Shape2String(shape),
+                        ") has zero elements. This is not supported by the CoreML EP.");
+        }
+
         // If we have an empty shape, this is a scalar input,
         // Since all the input output of CoreML EP is MultiArray, we will make the scalar input as a {1} MultiArray
         if (shape.empty())
@@ -165,8 +181,30 @@ common::Status CoreMLExecutionProvider::Compile(const std::vector<FusedNodeAndGr
       // TODO, investigate concurrent runs for different executions from the same model
       {
         std::unique_lock<OrtMutex> lock(model->GetMutex());
-        std::unordered_map<std::string, coreml::OnnxTensorData> outputs;
+        std::unordered_map<std::string, coreml::OnnxTensorInfo> outputs;
         outputs.reserve(model_outputs.size());
+
+        coreml::GetOutputTensorMutableRawDataFn get_output_tensor_mutable_raw_data_fn =
+            [&ctx, &model_outputs](  // callback handed to model->Predict() below
+                const std::string& name,
+                int32_t requested_onnx_tensor_element_type,
+                gsl::span<const int64_t> static_shape) -> void* {
+          const auto model_output_it = std::find(model_outputs.begin(), model_outputs.end(), name);
+          ORT_ENFORCE(model_output_it != model_outputs.end(), "Failed to find CoreML model output name: ", name);
+          const auto output_idx = gsl::narrow_cast<size_t>(std::distance(model_outputs.begin(), model_output_it));
+          // The position of `name` within model_outputs is used as the kernel context output index.
+          auto output_tensor = ctx.GetOutput(output_idx, static_shape.data(), static_shape.size());
+          // Check the requested element type against the tensor's actual one before handing out its buffer.
+          const auto type_and_shape_info = output_tensor.GetTensorTypeAndShapeInfo();
+          const auto actual_element_type = type_and_shape_info.GetElementType();
+          ORT_ENFORCE(utils::CApiElementTypeFromProtoType(requested_onnx_tensor_element_type) == actual_element_type,
+                      "Requested and actual output tensor element types do not match. Requested: ",
+                      utils::CApiElementTypeFromProtoType(requested_onnx_tensor_element_type),
+                      ", actual: ", actual_element_type);
+
+          return output_tensor.GetTensorMutableRawData();
+        };
+
         for (size_t i = 0; i < model_outputs.size(); i++) {
           const auto& output_name = model_outputs[i];
           const auto& output_info = model->GetInputOutputInfo(output_name);
@@ -183,34 +221,10 @@ common::Status CoreMLExecutionProvider::Compile(const std::vector<FusedNodeAndGr
           if (model->IsInt64Output(output_name))
             output_type = ONNX_NAMESPACE::TensorProto_DataType_INT64;
 
-          auto output_tensor =
-              ctx.GetOutput(i, output_shape.data(), output_shape.size());
-
-          void* output_buffer;
-          switch (output_type) {
-            case ONNX_NAMESPACE::TensorProto_DataType_FLOAT:
-              output_buffer = output_tensor.GetTensorMutableData<float>();
-              break;
-            case ONNX_NAMESPACE::TensorProto_DataType_INT32:
-              output_buffer = output_tensor.GetTensorMutableData<int32_t>();
-              break;
-            case ONNX_NAMESPACE::TensorProto_DataType_INT64:
-              output_buffer = output_tensor.GetTensorMutableData<int64_t>();
-              break;
-            default:
-              return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
-                                     "Unsupported type: ", output_type, " for output: ", output_name);
-              break;
-          }
-
-          outputs.emplace(output_name,
-                          coreml::OnnxTensorData{
-                              coreml::OnnxTensorInfo{output_type, output_shape},
-                              output_buffer,
-                          });
+          outputs.emplace(output_name, coreml::OnnxTensorInfo{output_type, output_shape});
         }
 
-        return model->Predict(inputs, outputs);
+        return model->Predict(inputs, outputs, get_output_tensor_mutable_raw_data_fn);
       }
     };
 
diff --git a/onnxruntime/core/providers/coreml/model/host_utils.h b/onnxruntime/core/providers/coreml/model/host_utils.h
index 014e2147ca..f7f45bce08 100644
--- a/onnxruntime/core/providers/coreml/model/host_utils.h
+++ b/onnxruntime/core/providers/coreml/model/host_utils.h
@@ -6,6 +6,8 @@
 
 #pragma once
 
+#include <string>
+
 #define API_AVAILABLE_OS_VERSIONS API_AVAILABLE(macos(10.15), ios(13))
 
 // Base requireed OS to run CoreML Specification Version 4 (Core ML 3)
diff --git a/onnxruntime/core/providers/coreml/model/host_utils.mm b/onnxruntime/core/providers/coreml/model/host_utils.mm
index 2a5453ae53..4c394386cd 100644
--- a/onnxruntime/core/providers/coreml/model/host_utils.mm
+++ b/onnxruntime/core/providers/coreml/model/host_utils.mm
@@ -1,10 +1,9 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#import <Foundation/Foundation.h>
+#include "core/providers/coreml/model/host_utils.h"
 
-#include <string>
-#include "host_utils.h"
+#import <Foundation/Foundation.h>
 
 namespace onnxruntime {
 namespace coreml {
diff --git a/onnxruntime/core/providers/coreml/model/model.h b/onnxruntime/core/providers/coreml/model/model.h
index 3a1f032196..c97a18839b 100644
--- a/onnxruntime/core/providers/coreml/model/model.h
+++ b/onnxruntime/core/providers/coreml/model/model.h
@@ -3,6 +3,13 @@
 
 #pragma once
 
+#include <cstddef>
+#include <functional>
+#include <unordered_set>
+
+#include "core/common/common.h"
+#include "core/common/gsl.h"
+#include "core/common/logging/logging.h"
 #include "core/common/status.h"
 #include "core/platform/ort_mutex.h"
 
@@ -21,6 +28,10 @@ struct OnnxTensorData {
   void* buffer{nullptr};
 };
 
+using GetOutputTensorMutableRawDataFn = std::function<void*(const std::string& name,
+                                                            int32_t requested_onnx_tensor_element_type,
+                                                            gsl::span<const int64_t> static_shape)>;
+
 class Model {
   friend class ModelBuilder;
 
@@ -28,8 +39,9 @@ class Model {
   ~Model();
   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Model);
 
-  onnxruntime::common::Status Predict(const std::unordered_map<std::string, OnnxTensorData>& inputs,
-                                      const std::unordered_map<std::string, OnnxTensorData>& outputs);
+  Status Predict(const std::unordered_map<std::string, OnnxTensorData>& inputs,
+                 const std::unordered_map<std::string, OnnxTensorInfo>& outputs,
+                 const GetOutputTensorMutableRawDataFn& get_output_tensor_mutable_raw_data_fn);
 
   bool IsScalarOutput(const std::string& output_name) const;
 
@@ -45,6 +57,7 @@ class Model {
   const std::vector<std::string>& GetOutputs() const { return outputs_; }
   void SetOutputs(std::vector<std::string>&& outputs) { outputs_ = std::move(outputs); }
 
+  const OnnxTensorInfo* TryGetInputOutputInfo(const std::string& name) const;
   const OnnxTensorInfo& GetInputOutputInfo(const std::string& name) const;
 
  private:
@@ -60,7 +73,7 @@ class Model {
   OrtMutex mutex_;
 
   Model(const std::string& path, const logging::Logger& logger, uint32_t coreml_flags);
-  onnxruntime::common::Status LoadModel();
+  Status LoadModel();
 
   void SetInputOutputInfo(std::unordered_map<std::string, OnnxTensorInfo>&& input_output_info) {
     input_output_info_ = std::move(input_output_info);
@@ -76,4 +89,4 @@ class Model {
 };
 
 }  // namespace coreml
-}  // namespace onnxruntime
\ No newline at end of file
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/coreml/model/model.mm b/onnxruntime/core/providers/coreml/model/model.mm
index 1ce5acd2c1..b183125061 100644
--- a/onnxruntime/core/providers/coreml/model/model.mm
+++ b/onnxruntime/core/providers/coreml/model/model.mm
@@ -1,36 +1,108 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#include "core/providers/coreml/model/model.h"
+
+#import <CoreML/CoreML.h>
+#import <Foundation/Foundation.h>
+
+#include <algorithm>
 #include <cstdint>
 #include <unordered_map>
 #include <vector>
 
 #include "core/common/common.h"
+#include "core/common/gsl.h"
+#include "core/common/inlined_containers.h"
 #include "core/common/logging/logging.h"
 #include "core/graph/onnx_protobuf.h"
 #include "core/providers/coreml/builders/helper.h"
 #include "core/providers/coreml/coreml_provider_factory.h"
-#include "host_utils.h"
-#include "model.h"
-
-#import <CoreML/CoreML.h>
-#import <Foundation/Foundation.h>
+#include "core/providers/coreml/model/host_utils.h"
+#include "core/providers/coreml/shape_utils.h"
 
 // force the linker to create a dependency on the CoreML framework so that in MAUI usage we don't need
 // to manually do this
 asm(".linker_option \"-framework\", \"CoreML\"");
 
+using namespace onnxruntime;
+using namespace onnxruntime::coreml;
+
+namespace {
+/**
+ * Computes the static output shape used to allocate the output tensor: `inferred_shape` with every dynamic
+ * dimension (-1) replaced by the matching dimension of `coreml_static_shape`, with the two shapes right-aligned.
+ * `inferred_shape` is the shape known at model compile time. `coreml_static_shape` is the shape of the CoreML
+ * MLMultiArray output; it must NOT contain -1 and may only add leading 1's. Violations fail an ORT_ENFORCE.
+ */
+InlinedVector<int64_t> GetStaticOutputShape(gsl::span<const int64_t> inferred_shape,
+                                            gsl::span<const int64_t> coreml_static_shape,
+                                            const logging::Logger& logger) {
+  ORT_ENFORCE(IsStaticShape(coreml_static_shape),
+              "CoreML output shape (", Shape2String(coreml_static_shape), ") is not static.");
+
+  // return early if the shapes match
+  if (std::equal(inferred_shape.begin(), inferred_shape.end(),
+                 coreml_static_shape.begin(), coreml_static_shape.end())) {
+    return InlinedVector<int64_t>(coreml_static_shape.begin(), coreml_static_shape.end());
+  }
+
+  // Special CoreML behavior notes:
+  // - Sometimes the CoreML output shape has extra leading ones.
+
+  ORT_ENFORCE(inferred_shape.size() <= coreml_static_shape.size(),
+              "CoreML static output shape (", Shape2String(coreml_static_shape),
+              ") has fewer elements than the inferred shape (", Shape2String(inferred_shape), ").");
+
+  // if coreml_static_shape has more elements, we expect them to be leading ones
+  const size_t num_leading_dimensions = coreml_static_shape.size() - inferred_shape.size();
+  const auto coreml_static_shape_common_begin = coreml_static_shape.begin() + num_leading_dimensions;
+
+  if (num_leading_dimensions > 0) {
+    const bool has_only_leading_ones =
+        std::all_of(coreml_static_shape.begin(), coreml_static_shape_common_begin,
+                    [](int64_t dim) { return dim == 1; });
+    ORT_ENFORCE(has_only_leading_ones, "CoreML static output shape (", Shape2String(coreml_static_shape),
+                ") has leading dimensions with value other than 1.");
+  }
+  // Walk both shapes right-aligned; every inferred dim must be dynamic (-1) or equal the CoreML dim.
+  InlinedVector<int64_t> static_shape{};
+  static_shape.reserve(inferred_shape.size());
+  std::transform(inferred_shape.begin(), inferred_shape.end(),
+                 coreml_static_shape_common_begin,
+                 std::back_inserter(static_shape),
+                 [&](int64_t inferred_dim, int64_t coreml_static_dim) {
+                   ORT_ENFORCE(inferred_dim == -1 || inferred_dim == coreml_static_dim,
+                               "CoreML static output shape (", Shape2String(coreml_static_shape),
+                               ") and inferred shape (", Shape2String(inferred_shape),
+                               ") have inconsistent static dimensions (", coreml_static_dim, " vs. ",
+                               inferred_dim, ").");
+
+                   return inferred_dim != -1 ? inferred_dim : coreml_static_dim;
+                 });
+
+  // Ideally, the CoreML static shape would match the inferred shape exactly, apart from the former providing values
+  // for -1's in the latter. For now, this is not the case so it is probably worth logging them.
+  LOGS(logger, VERBOSE) << "CoreML static output shape: " << Shape2String(coreml_static_shape)
+                        << ", inferred shape: " << Shape2String(inferred_shape)
+                        << ", resulting static output shape: " << Shape2String(static_shape);
+  return static_shape;
+}
+}  // namespace
+
+NS_ASSUME_NONNULL_BEGIN
+
 // Model input for a CoreML model
 // All the input onnx tensors values will be converted to MLMultiArray(s)
 @interface OnnxTensorFeatureProvider : NSObject <MLFeatureProvider> {
-  const std::unordered_map<std::string, onnxruntime::coreml::OnnxTensorData>* inputs_;
+  const std::unordered_map<std::string, OnnxTensorData>* inputs_;  // address of the map given to initWithInputs
   NSSet* featureNames_;
-  const onnxruntime::logging::Logger* logger_;
+  const logging::Logger* logger_;  // address of the logger given to initWithInputs
 }
 
-- (instancetype)initWithInputs:(const std::unordered_map<std::string, onnxruntime::coreml::OnnxTensorData>&)inputs
-                        logger:(const onnxruntime::logging::Logger*)logger;
-- (MLFeatureValue*)featureValueForName:(NSString*)featureName API_AVAILABLE_OS_VERSIONS;
+- (instancetype)initWithInputs:(const std::unordered_map<std::string, OnnxTensorData>&)inputs
+                        logger:(const logging::Logger&)logger;
+- (nullable MLFeatureValue*)featureValueForName:(NSString*)featureName API_AVAILABLE_OS_VERSIONS;
 - (NSSet<NSString*>*)featureNames;
 
 @end
@@ -42,19 +114,20 @@ asm(".linker_option \"-framework\", \"CoreML\"");
 @interface CoreMLExecution : NSObject {
   NSString* coreml_model_path_;
   NSString* compiled_model_path_;
-  const onnxruntime::logging::Logger* logger_;
+  const logging::Logger* logger_;
   uint32_t coreml_flags_;
 }
 
 - (instancetype)initWithPath:(const std::string&)path
-                      logger:(const onnxruntime::logging::Logger&)logger
+                      logger:(const logging::Logger&)logger
                 coreml_flags:(uint32_t)coreml_flags;
 - (void)cleanup;
 - (void)dealloc;
-- (onnxruntime::common::Status)loadModel API_AVAILABLE_OS_VERSIONS;
-- (onnxruntime::common::Status)
-    predict:(const std::unordered_map<std::string, onnxruntime::coreml::OnnxTensorData>&)inputs
-    outputs:(const std::unordered_map<std::string, onnxruntime::coreml::OnnxTensorData>&)outputs
+- (Status)loadModel API_AVAILABLE_OS_VERSIONS;
+- (Status)predict:(const std::unordered_map<std::string, OnnxTensorData>&)inputs
+                  outputs:(const std::unordered_map<std::string, OnnxTensorInfo>&)outputs
+    getOutputTensorDataFn:(const GetOutputTensorMutableRawDataFn&)
+                              get_output_tensor_mutable_raw_data_fn
     API_AVAILABLE_OS_VERSIONS;
 
 @property MLModel* model API_AVAILABLE_OS_VERSIONS;
@@ -63,16 +136,16 @@ asm(".linker_option \"-framework\", \"CoreML\"");
 
 @implementation OnnxTensorFeatureProvider
 
-- (instancetype)initWithInputs:(const std::unordered_map<std::string, onnxruntime::coreml::OnnxTensorData>&)inputs
-                        logger:(const onnxruntime::logging::Logger*)logger {
+- (instancetype)initWithInputs:(const std::unordered_map<std::string, OnnxTensorData>&)inputs
+                        logger:(const logging::Logger&)logger {  // both arguments are kept by address, not copied
   if (self = [super init]) {
     inputs_ = &inputs;
-    logger_ = logger;
+    logger_ = &logger;  // taken by reference, stored as a pointer
   }
   return self;
 }
 
-- (nonnull NSSet<NSString*>*)featureNames {
+- (NSSet<NSString*>*)featureNames {
   if (featureNames_ == nil) {
     NSMutableArray* names = [[NSMutableArray alloc] init];
     for (const auto& input : *inputs_) {
@@ -88,7 +161,7 @@ asm(".linker_option \"-framework\", \"CoreML\"");
   return featureNames_;
 }
 
-- (nullable MLFeatureValue*)featureValueForName:(nonnull NSString*)featureName {
+- (nullable MLFeatureValue*)featureValueForName:(NSString*)featureName {
   auto it = inputs_->find([featureName cStringUsingEncoding:NSUTF8StringEncoding]);
   if (it != inputs_->end()) {
     auto& input = it->second;
@@ -118,8 +191,9 @@ asm(".linker_option \"-framework\", \"CoreML\"");
                                                                 shape:shape
                                                              dataType:data_type
                                                               strides:strides
-                                                          deallocator:(^(void* /* bytes */){
-                                                                      })error:&error];
+                                                          deallocator:^(void* /* bytes */) {
+                                                          }
+                                                                error:&error];
     if (error != nil) {
       LOGS(*logger_, ERROR) << "Failed to create MLMultiArray for feature: " << [featureName UTF8String]
                             << ", error: " << [[error localizedDescription] UTF8String];
@@ -139,7 +213,7 @@ asm(".linker_option \"-framework\", \"CoreML\"");
 @implementation CoreMLExecution
 
 - (instancetype)initWithPath:(const std::string&)path
-                      logger:(const onnxruntime::logging::Logger&)logger
+                      logger:(const logging::Logger&)logger
                 coreml_flags:(uint32_t)coreml_flags {
   if (self = [super init]) {
     coreml_model_path_ = [NSString stringWithUTF8String:path.c_str()];
@@ -175,7 +249,7 @@ asm(".linker_option \"-framework\", \"CoreML\"");
   [self cleanup];
 }
 
-- (onnxruntime::common::Status)loadModel {
+- (Status)loadModel {
   NSError* error = nil;
   NSURL* modelUrl = [NSURL URLWithString:coreml_model_path_];
   NSAssert(modelUrl != nil, @"modelUrl must not be nil");
@@ -199,100 +273,122 @@ asm(".linker_option \"-framework\", \"CoreML\"");
                            [[error localizedDescription] cStringUsingEncoding:NSUTF8StringEncoding]);
   }
 
-  return onnxruntime::common::Status::OK();
+  return Status::OK();
 }
 
-- (onnxruntime::common::Status)
-    predict:(const std::unordered_map<std::string, onnxruntime::coreml::OnnxTensorData>&)inputs
-    outputs:(const std::unordered_map<std::string, onnxruntime::coreml::OnnxTensorData>&)outputs {
-  if (_model == nil) {
-    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "model is not loaded");
-  }
-
-  OnnxTensorFeatureProvider* input_feature = [[OnnxTensorFeatureProvider alloc] initWithInputs:inputs
-                                                                                        logger:logger_];
-
-  if (input_feature == nil) {
-    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "inputFeature is not initialized");
-  }
-
-  MLPredictionOptions* options = [[MLPredictionOptions alloc] init];
-  NSError* error = nil;
-  id<MLFeatureProvider> output_feature = [_model predictionFromFeatures:input_feature
-                                                                options:options
-                                                                  error:&error];
-
-  if (error != nil) {
-    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Error executing model: ",
-                           [[error localizedDescription] cStringUsingEncoding:NSUTF8StringEncoding]);
-  }
-
-  for (auto& output : outputs) {
-    NSString* output_name = [NSString stringWithCString:output.first.c_str()
-                                               encoding:[NSString defaultCStringEncoding]];
-    NSAssert(output_name != nil, @"output_name must not be nil");
-    MLFeatureValue* output_value =
-        [output_feature featureValueForName:output_name];
-
-    if (output_value == nil) {
-      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "output_feature has no value for ",
-                             [output_name cStringUsingEncoding:NSUTF8StringEncoding]);
+- (Status)predict:(const std::unordered_map<std::string, OnnxTensorData>&)inputs
+                  outputs:(const std::unordered_map<std::string, OnnxTensorInfo>&)outputs
+    getOutputTensorDataFn:(const GetOutputTensorMutableRawDataFn&)get_output_tensor_mutable_raw_data_fn {
+  Status status = Status::OK();  // replaced only if an exception is caught at the end of this method
+  ORT_TRY {  // run the CoreML prediction, then copy each entry of `outputs` into the buffer from the callback
+    if (_model == nil) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "model is not loaded");
     }
 
-    auto* data = [output_value multiArrayValue];
-    auto* model_output_data = data.dataPointer;
-    if (model_output_data == nullptr) {
-      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "model_output_data has no data for ",
-                             [output_name cStringUsingEncoding:NSUTF8StringEncoding]);
+    OnnxTensorFeatureProvider* input_feature = [[OnnxTensorFeatureProvider alloc] initWithInputs:inputs
+                                                                                          logger:*logger_];
+
+    if (input_feature == nil) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "inputFeature is not initialized");
     }
 
-    auto model_output_type = data.dataType;
+    MLPredictionOptions* options = [[MLPredictionOptions alloc] init];
+    NSError* error = nil;
+    id<MLFeatureProvider> output_feature = [_model predictionFromFeatures:input_feature
+                                                                  options:options
+                                                                    error:&error];
 
-    auto& output_tensor = output.second;
-    size_t num_elements =
-        accumulate(output_tensor.tensor_info.shape.begin(),
-                   output_tensor.tensor_info.shape.end(),
-                   1,
-                   std::multiplies<int64_t>());
+    if (error != nil) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Error executing model: ",
+                             [[error localizedDescription] cStringUsingEncoding:NSUTF8StringEncoding]);
+    }
 
-    const auto type = output_tensor.tensor_info.data_type;
-    switch (type) {
-      case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: {
-        const auto output_data_byte_size = num_elements * sizeof(float);
-        memcpy(output_tensor.buffer, model_output_data, output_data_byte_size);
-        break;
+    for (const auto& [output_name, output_tensor_info] : outputs) {
+      MLFeatureValue* output_value =
+          [output_feature featureValueForName:[NSString stringWithUTF8String:output_name.c_str()]];
+
+      if (output_value == nil) {
+        return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "output_feature has no value for ", output_name);
       }
-      case ONNX_NAMESPACE::TensorProto_DataType_INT32: {
-        const auto output_data_byte_size = num_elements * sizeof(int32_t);
-        memcpy(output_tensor.buffer, model_output_data, output_data_byte_size);
-        break;
-      }
-      // For this case, since Coreml Spec only uses int32 for model output while onnx provides
-      // int64 for model output data type. We are doing a type casting (int32 -> int64) here
-      // when copying the model to ORT
-      case ONNX_NAMESPACE::TensorProto_DataType_INT64:
-        if (model_output_type == MLMultiArrayDataTypeInt32) {
-          int32_t* model_output_data_prime = static_cast<int32_t*>(model_output_data);
-          int64_t* output_tensor_buffer_prime = static_cast<int64_t*>(output_tensor.buffer);
-          for (size_t i = 0; i < num_elements; i++) {
-            output_tensor_buffer_prime[i] = model_output_data_prime[i];
-          }
+
+      auto* data = [output_value multiArrayValue];  // nil when the value does not hold an MLMultiArray
+      ORT_RETURN_IF(data == nil, "output_feature has no MLMultiArray value for ", output_name);
+      const auto coreml_static_output_shape = [&]() {
+        InlinedVector<int64_t> result;
+        result.reserve(data.shape.count);
+        for (NSNumber* dim in data.shape) {
+          const auto dim_value = dim.longLongValue;
+          result.push_back(dim_value);
         }
-        ORT_RETURN_IF_NOT(model_output_type == MLMultiArrayDataTypeInt32,
-                          "Coreml model_output_type is not MLMultiArrayDataTypeInt32 for the case");
-        break;
-      default:
-        return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
-                               "Output data type is not supported, actual type: ",
-                               type);
+        return result;
+      }();
+
+      const auto static_output_shape = GetStaticOutputShape(output_tensor_info.shape, coreml_static_output_shape,
+                                                            *logger_);
+
+      void* output_buffer = get_output_tensor_mutable_raw_data_fn(output_name, output_tensor_info.data_type,
+                                                                  static_output_shape);
+      // Nothing is copied for an empty MLMultiArray; the output buffer has still been requested above.
+      if (const size_t num_elements = data.count; num_elements > 0) {
+        if (const auto shape_size = ShapeSize(static_output_shape);
+            shape_size < 0 || num_elements != static_cast<size_t>(shape_size)) {
+          return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
+                                 "CoreML MLMultiArray count (", num_elements, ") and shape size (", shape_size,
+                                 ") do not match");
+        }
+        // NOTE(review): data.strides is not consulted; the copies below treat the storage as densely packed.
+        const void* model_output_data = data.dataPointer;
+
+        if (model_output_data == nullptr) {
+          return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "model_output_data has no data for ", output_name);
+        }
+
+        const auto onnx_data_type = output_tensor_info.data_type;
+        switch (onnx_data_type) {
+          case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: {
+            const auto output_data_byte_size = num_elements * sizeof(float);
+            memcpy(output_buffer, model_output_data, output_data_byte_size);
+            break;
+          }
+          case ONNX_NAMESPACE::TensorProto_DataType_INT32: {
+            const auto output_data_byte_size = num_elements * sizeof(int32_t);
+            memcpy(output_buffer, model_output_data, output_data_byte_size);
+            break;
+          }
+          // For this case, since Coreml Spec only uses int32 for model output while onnx provides
+          // int64 for model output data type. We are doing a type casting (int32 -> int64) here
+          // when copying the model to ORT
+          case ONNX_NAMESPACE::TensorProto_DataType_INT64: {
+            ORT_RETURN_IF_NOT(data.dataType == MLMultiArrayDataTypeInt32,
+                              "CoreML output data type is not MLMultiArrayDataTypeInt32");
+
+            const int32_t* model_output_data_i32 = static_cast<const int32_t*>(model_output_data);
+            int64_t* output_tensor_buffer_i64 = static_cast<int64_t*>(output_buffer);
+            for (size_t i = 0; i < num_elements; i++) {
+              output_tensor_buffer_i64[i] = model_output_data_i32[i];
+            }
+            break;
+          }
+          default:
+            return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
+                                   "Output data type is not supported, actual type: ", onnx_data_type);
+        }
+      }
     }
   }
+  ORT_CATCH(const std::exception& e) {
+    ORT_HANDLE_EXCEPTION([&]() {
+      status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Exception: ", e.what());
+    });
+  }
 
-  return onnxruntime::common::Status::OK();
+  return status;
 }
 
 @end
 
+NS_ASSUME_NONNULL_END
+
 namespace onnxruntime {
 namespace coreml {
 
@@ -305,7 +401,8 @@ class Execution {
 
   Status LoadModel();
   Status Predict(const std::unordered_map<std::string, OnnxTensorData>& inputs,
-                 const std::unordered_map<std::string, OnnxTensorData>& outputs);
+                 const std::unordered_map<std::string, OnnxTensorInfo>& outputs,
+                 const GetOutputTensorMutableRawDataFn& get_output_tensor_mutable_raw_data_fn);
 
  private:
   bool model_loaded{false};
@@ -338,12 +435,15 @@ Status Execution::LoadModel() {
 }
 
 Status Execution::Predict(const std::unordered_map<std::string, OnnxTensorData>& inputs,
-                          const std::unordered_map<std::string, OnnxTensorData>& outputs) {
+                          const std::unordered_map<std::string, OnnxTensorInfo>& outputs,
+                          const GetOutputTensorMutableRawDataFn& get_output_tensor_mutable_raw_data_fn) {
   ORT_RETURN_IF_NOT(model_loaded, "Execution::Predict requires Execution::LoadModel");
 
   if (HAS_VALID_BASE_OS_VERSION) {
     @autoreleasepool {
-      return [execution_ predict:inputs outputs:outputs];
+      return [execution_ predict:inputs
+                         outputs:outputs
+           getOutputTensorDataFn:get_output_tensor_mutable_raw_data_fn];
     }
   }
 
@@ -361,8 +461,9 @@ Status Model::LoadModel() {
 }
 
 Status Model::Predict(const std::unordered_map<std::string, OnnxTensorData>& inputs,
-                      const std::unordered_map<std::string, OnnxTensorData>& outputs) {
-  return execution_->Predict(inputs, outputs);
+                      const std::unordered_map<std::string, OnnxTensorInfo>& outputs,
+                      const GetOutputTensorMutableRawDataFn& get_output_tensor_mutable_raw_data_fn) {
+  return execution_->Predict(inputs, outputs, get_output_tensor_mutable_raw_data_fn);  // forwarded unchanged
 }
 
 bool Model::IsScalarOutput(const std::string& output_name) const {
@@ -373,8 +474,15 @@ bool Model::IsInt64Output(const std::string& output_name) const {
   return Contains(int64_outputs_, output_name);
 }
 
+const OnnxTensorInfo* Model::TryGetInputOutputInfo(const std::string& name) const {  // lookup by name
+  const auto info_it = input_output_info_.find(name);
+  return info_it != input_output_info_.end() ? &info_it->second : nullptr;  // nullptr when `name` is not present
+}
+
 const OnnxTensorInfo& Model::GetInputOutputInfo(const std::string& name) const {
-  return input_output_info_.at(name);
+  const auto* info = TryGetInputOutputInfo(name);  // nullptr when `name` is not present
+  ORT_ENFORCE(info != nullptr, "Failed to get info for input/output: ", name);  // the failure message names `name`
+  return *info;
 }
 
 }  // namespace coreml
diff --git a/onnxruntime/core/providers/coreml/shape_utils.cc b/onnxruntime/core/providers/coreml/shape_utils.cc
new file mode 100644
index 0000000000..255aba0e8c
--- /dev/null
+++ b/onnxruntime/core/providers/coreml/shape_utils.cc
@@ -0,0 +1,77 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/providers/coreml/shape_utils.h"
+
+#include <algorithm>
+#include <utility>
+
+#include "core/framework/tensor_shape.h"
+#include "core/framework/tensorprotoutils.h"
+
+namespace onnxruntime::coreml {
+
+namespace {
+// Shared implementation of GetShape() and GetStaticShape().
+// Copies the dimensions of `node_arg`'s shape into `shape_out`, writing -1 for each dimension without a value.
+// Returns false, leaving `shape_out` untouched, if `node_arg` has no shape info or if a dimension without a value
+// is found while `allow_dynamic_shape` is false. A negative dimension value fails the ORT_ENFORCE check.
+bool GetShapeImpl(const NodeArg& node_arg, std::vector<int64_t>& shape_out, const logging::Logger& logger,
+                  bool allow_dynamic_shape) {
+  const auto* shape_proto = node_arg.Shape();
+  if (!shape_proto) {
+    LOGS(logger, VERBOSE) << "NodeArg [" << node_arg.Name() << "] has no shape info";
+    return false;
+  }
+
+  // build the result in a local so that `shape_out` is only modified on success
+  std::vector<int64_t> shape{};
+  shape.reserve(shape_proto->dim().size());
+
+  for (const auto& dim : shape_proto->dim()) {
+    if (utils::HasDimValue(dim)) {
+      const auto dim_value = dim.dim_value();
+      ORT_ENFORCE(dim_value >= 0, "NodeArg [", node_arg.Name(), "] has a negative dimension value");
+      shape.push_back(dim_value);
+    } else {
+      // dynamic dimension
+      if (!allow_dynamic_shape) {
+        LOGS(logger, VERBOSE) << "NodeArg [" << node_arg.Name() << "] has shape with dynamic dimension";
+        return false;
+      }
+      shape.push_back(-1);
+    }
+  }
+
+  shape_out = std::move(shape);
+  return true;
+}
+}  // namespace
+
+bool GetShape(const NodeArg& node_arg, std::vector<int64_t>& shape, const logging::Logger& logger) {
+  return GetShapeImpl(node_arg, shape, logger, /* allow_dynamic_shape */ true);
+}
+
+bool GetStaticShape(const NodeArg& node_arg, std::vector<int64_t>& shape, const logging::Logger& logger) {
+  return GetShapeImpl(node_arg, shape, logger, /* allow_dynamic_shape */ false);
+}
+
+bool IsStaticShape(gsl::span<const int64_t> shape) {
+  // a dynamic dimension is represented by -1, see GetShapeImpl()
+  return std::find(shape.begin(), shape.end(), int64_t{-1}) == shape.end();
+}
+
+bool DoesShapeSpecifyZeroElements(gsl::span<const int64_t> shape) {
+  // a single dimension of 0 is enough, the other dimensions do not need to be looked at
+  return std::find(shape.begin(), shape.end(), int64_t{0}) != shape.end();
+}
+
+int64_t ShapeSize(gsl::span<const int64_t> shape) {
+  return TensorShape(shape).Size();
+}
+
+std::string Shape2String(gsl::span<const int64_t> shape) {
+  return TensorShape(shape).ToString();
+}
+
+}  // namespace onnxruntime::coreml
diff --git a/onnxruntime/core/providers/coreml/shape_utils.h b/onnxruntime/core/providers/coreml/shape_utils.h
new file mode 100644
index 0000000000..0a1fd47cfd
--- /dev/null
+++ b/onnxruntime/core/providers/coreml/shape_utils.h
@@ -0,0 +1,41 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "core/common/gsl.h"
+#include "core/common/logging/logging.h"
+#include "core/graph/node_arg.h"
+
+namespace onnxruntime::coreml {
+
+// Gets `node_arg`'s shape. Dynamic dimensions will have a value of -1. All other dimensions will be non-negative.
+// Returns false and leaves `shape` unchanged if `node_arg` has no shape info. The reason is logged to `logger` at
+// VERBOSE level.
+bool GetShape(const NodeArg& node_arg, std::vector<int64_t>& shape, const logging::Logger& logger);
+
+// Gets `node_arg`'s shape if it has no dynamic dimensions. All dimensions will be non-negative.
+// Returns false and leaves `shape` unchanged if `node_arg` has no shape info or has a dynamic dimension. The reason
+// is logged to `logger` at VERBOSE level.
+bool GetStaticShape(const NodeArg& node_arg, std::vector<int64_t>& shape, const logging::Logger& logger);
+
+// True iff `shape` has no dynamic dimensions.
+// Dynamic dimensions are expected to be represented by -1, as in the output of GetShape().
+bool IsStaticShape(gsl::span<const int64_t> shape);
+
+// True iff `shape` specifies zero elements with its non-dynamic dimensions. Like `TensorShape::Size() == 0`, but it
+// does not compute the size.
+// I.e., true iff at least one dimension is 0.
+bool DoesShapeSpecifyZeroElements(gsl::span<const int64_t> shape);
+
+// Gets the number of elements contained by the shape or -1 if the shape has any dynamic dimensions.
+int64_t ShapeSize(gsl::span<const int64_t> shape);
+
+// Gets a string representation of `shape`.
+std::string Shape2String(gsl::span<const int64_t> shape);
+
+}  // namespace onnxruntime::coreml
diff --git a/onnxruntime/core/providers/cpu/tensor/reshape_helper.h b/onnxruntime/core/providers/cpu/tensor/reshape_helper.h
index 2e4b741d26..5961686674 100644
--- a/onnxruntime/core/providers/cpu/tensor/reshape_helper.h
+++ b/onnxruntime/core/providers/cpu/tensor/reshape_helper.h
@@ -11,6 +11,10 @@ namespace onnxruntime {
 class ReshapeHelper {
  public:
   ReshapeHelper(const TensorShape& input_shape, TensorShapeVector& requested_shape, bool allow_zero = false) {
+    const auto input_shape_size = input_shape.Size();
+    ORT_ENFORCE(input_shape_size != -1,
+                "The input tensor must not have any dynamic (-1) dimensions. Input shape:", input_shape);
+
     auto nDims = requested_shape.size();
     ptrdiff_t unknown_dim = -1;
     int64_t size = 1;
@@ -32,12 +36,12 @@ class ReshapeHelper {
 
     if (unknown_dim != -1) {
       // calculate unknown dimension
-      ORT_ENFORCE(size != 0 && (input_shape.Size() % size) == 0,
+      ORT_ENFORCE(size != 0 && (input_shape_size % size) == 0,
                   "The input tensor cannot be reshaped to the requested shape. Input shape:", input_shape, ", requested shape:", TensorShape(requested_shape));
-      requested_shape[unknown_dim] = input_shape.Size() / size;
+      requested_shape[unknown_dim] = input_shape_size / size;
     } else {
       // check if the output shape is valid.
-      ORT_ENFORCE(gsl::narrow_cast<int64_t>(input_shape.Size()) == size,
+      ORT_ENFORCE(input_shape_size == size,
                   "The input tensor cannot be reshaped to the requested shape. Input shape:", input_shape, ", requested shape:", TensorShape(requested_shape));
     }
   }
diff --git a/onnxruntime/test/providers/base_tester.cc b/onnxruntime/test/providers/base_tester.cc
index 87573286f1..459a8c71ad 100644
--- a/onnxruntime/test/providers/base_tester.cc
+++ b/onnxruntime/test/providers/base_tester.cc
@@ -771,6 +771,8 @@ void BaseTester::ExecuteModelForEps(
 
   provider_type.resize(provider_type.size() - 1);  // remove the trailing ':'
 
+  SCOPED_TRACE(MakeString("registered execution providers: ", provider_type));
+
   if (custom_registries != nullptr) {
     for (const auto& reg : *custom_registries) {
       ASSERT_PROVIDER_STATUS_OK(session_object.RegisterCustomRegistry(reg));
diff --git a/onnxruntime/test/providers/coreml/coreml_basic_test.cc b/onnxruntime/test/providers/coreml/coreml_basic_test.cc
index d256356d67..d26df8c52e 100644
--- a/onnxruntime/test/providers/coreml/coreml_basic_test.cc
+++ b/onnxruntime/test/providers/coreml/coreml_basic_test.cc
@@ -21,7 +21,6 @@
 #include "gtest/gtest.h"
 #include "gmock/gmock.h"
 
-using namespace std;
 using namespace ONNX_NAMESPACE;
 using namespace ::onnxruntime::logging;
 
diff --git a/onnxruntime/test/providers/coreml/dynamic_input_test.cc b/onnxruntime/test/providers/coreml/dynamic_input_test.cc
new file mode 100644
index 0000000000..39343bcfac
--- /dev/null
+++ b/onnxruntime/test/providers/coreml/dynamic_input_test.cc
@@ -0,0 +1,124 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "gtest/gtest.h"
+
+#include <memory>
+#include <vector>
+
+#include "core/providers/coreml/coreml_execution_provider.h"
+#include "core/providers/coreml/coreml_provider_factory.h"  // for COREMLFlags
+#include "test/common/random_generator.h"
+#include "test/providers/model_tester.h"
+#include "test/util/include/current_test_name.h"
+#include "test/util/include/test_utils.h"
+
+namespace onnxruntime::test {
+
+// Runs the MatMul model, whose input `A` has a dynamic first dimension, with M = 1..5.
+// Each run creates a new CoreML EP (flags = 0) and goes through RunAndVerifyOutputsWithEP().
+TEST(CoreMLExecutionProviderDynamicInputShapeTest, MatMul) {
+  constexpr auto model_path = ORT_TSTR("testdata/matmul_with_dynamic_input_shape.onnx");
+
+  auto test = [&](const size_t M) {
+    SCOPED_TRACE(MakeString("M=", M));
+
+    auto coreml_ep = std::make_unique<CoreMLExecutionProvider>(0);
+
+    const auto ep_verification_params = EPVerificationParams{
+        ExpectedEPNodeAssignment::All,
+        2e-3f,
+    };
+
+    RandomValueGenerator gen{1234};
+    const auto A_shape = std::vector<int64_t>{static_cast<int64_t>(M), 2};
+    const auto A_data = gen.Uniform<float>(A_shape, 0.0f, 1.0f);
+
+    OrtValue A = CreateInputOrtValueOnCPU<float>(A_shape, A_data);
+
+    RunAndVerifyOutputsWithEP(model_path, CurrentTestName(),
+                              std::move(coreml_ep),
+                              {{"A", A}},
+                              ep_verification_params);
+  };
+
+  for (size_t i = 1; i <= 5; ++i) {
+    test(i);
+  }
+}
+
+// Same procedure as the MatMul test, using the MobileNet v3 small excerpt model with batch sizes 1..5.
+TEST(CoreMLExecutionProviderDynamicInputShapeTest, MobileNetExcerpt) {
+  constexpr auto model_path = ORT_TSTR("testdata/mobilenet_v3_small_excerpt.onnx");
+
+  auto test = [&](const size_t batch_size) {
+    SCOPED_TRACE(MakeString("batch_size=", batch_size));
+
+    auto coreml_ep = std::make_unique<CoreMLExecutionProvider>(0);
+
+    const auto ep_verification_params = EPVerificationParams{
+        ExpectedEPNodeAssignment::All,
+        5e-2f,
+    };
+
+    RandomValueGenerator gen{1234};
+    const auto input_shape = std::vector<int64_t>{static_cast<int64_t>(batch_size), 3, 224, 224};
+    const auto input_data = gen.Uniform<float>(input_shape, 0.0f, 1.0f);
+
+    OrtValue input = CreateInputOrtValueOnCPU<float>(input_shape, input_data);
+
+    RunAndVerifyOutputsWithEP(model_path, CurrentTestName(),
+                              std::move(coreml_ep),
+                              {{"input", input}},
+                              ep_verification_params);
+  };
+
+  for (size_t i = 1; i <= 5; ++i) {
+    test(i);
+  }
+}
+
+// An input with a runtime shape of {0, 2} (zero elements) is expected to make the run fail with the message below.
+TEST(CoreMLExecutionProviderDynamicInputShapeTest, EmptyInputFails) {
+  constexpr auto model_path = ORT_TSTR("testdata/matmul_with_dynamic_input_shape.onnx");
+
+  ModelTester tester(CurrentTestName(), model_path);
+
+  tester.AddInput<float>("A", {0, 2}, {});
+  tester.AddOutput<float>("Y", {0, 4}, {});
+
+  auto eps = std::vector<std::unique_ptr<IExecutionProvider>>{};
+  eps.emplace_back(std::make_unique<CoreMLExecutionProvider>(0));
+
+  tester
+      .Config(ModelTester::ExpectResult::kExpectFailure,
+              "the runtime shape ({0,2}) has zero elements. This is not supported by the CoreML EP.")
+      .ConfigEps(std::move(eps))
+      .RunWithConfig();
+}
+
+// With COREML_FLAG_ONLY_ALLOW_STATIC_INPUT_SHAPES set, none of the nodes of the model with the dynamic input shape
+// are expected to be assigned to the CoreML EP.
+TEST(CoreMLExecutionProviderDynamicInputShapeTest, OnlyAllowStaticInputShapes) {
+  constexpr auto model_path = ORT_TSTR("testdata/matmul_with_dynamic_input_shape.onnx");
+
+  auto coreml_ep = std::make_unique<CoreMLExecutionProvider>(COREML_FLAG_ONLY_ALLOW_STATIC_INPUT_SHAPES);
+
+  const auto ep_verification_params = EPVerificationParams{
+      ExpectedEPNodeAssignment::None,  // expect no supported nodes because we disable dynamic input shape support
+  };
+
+  constexpr size_t M = 3;
+  RandomValueGenerator gen{1234};
+  const auto A_shape = std::vector<int64_t>{static_cast<int64_t>(M), 2};
+  const auto A_data = gen.Uniform<float>(A_shape, 0.0f, 1.0f);
+
+  OrtValue A = CreateInputOrtValueOnCPU<float>(A_shape, A_data);
+
+  RunAndVerifyOutputsWithEP(model_path, CurrentTestName(),
+                            std::move(coreml_ep),
+                            {{"A", A}},
+                            ep_verification_params);
+}
+
+}  // namespace onnxruntime::test
diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
index afc70b1a5c..b44ba75486 100644
--- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
+++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
@@ -3244,8 +3244,14 @@ TEST(ReductionOpTest, ReduceDimWithZero1) {
     auto expect = error_msg.empty() ? OpTester::ExpectResult::kExpectSuccess
                                     : OpTester::ExpectResult::kExpectFailure;
 
-    // exclude OpenVINO and TensorRT as this isn't handled by those EPs
-    tester.Run(expect, error_msg, {kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kQnnExecutionProvider});
+    tester.Run(expect, error_msg,
+               // exclude EPs that don't handle this
+               {
+                   kCoreMLExecutionProvider,
+                   kOpenVINOExecutionProvider,
+                   kQnnExecutionProvider,
+                   kTensorrtExecutionProvider,
+               });
   };
 
   // reduce on all axes keeping dims. should allow the 0 to be the reduced value
@@ -3285,8 +3291,14 @@ TEST(ReductionOpTest, ReduceDimWithZero2) {
     auto expect = error_msg.empty() ? OpTester::ExpectResult::kExpectSuccess
                                     : OpTester::ExpectResult::kExpectFailure;
 
-    // exclude OpenVINO and TensorRT as this isn't handled by those EPs
-    tester.Run(expect, error_msg, {kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kQnnExecutionProvider});
+    tester.Run(expect, error_msg,
+               // exclude EPs that don't handle this
+               {
+                   kCoreMLExecutionProvider,
+                   kOpenVINOExecutionProvider,
+                   kQnnExecutionProvider,
+                   kTensorrtExecutionProvider,
+               });
   };
 
   // reduction without keeping dims on all axes. can't reduce on an axis with value of 0
@@ -3323,8 +3335,14 @@ TEST(ReductionOpTest, ReduceSum_ReduceDimWithZero3) {
     auto expect = error_msg.empty() ? OpTester::ExpectResult::kExpectSuccess
                                     : OpTester::ExpectResult::kExpectFailure;
 
-    // exclude OpenVINO and TensorRT as this isn't handled by those EPs
-    tester.Run(expect, error_msg, {kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kQnnExecutionProvider});
+    tester.Run(expect, error_msg,
+               // exclude EPs that don't handle this
+               {
+                   kCoreMLExecutionProvider,
+                   kOpenVINOExecutionProvider,
+                   kQnnExecutionProvider,
+                   kTensorrtExecutionProvider,
+               });
   };
 
   // reduction is possible without keeping dims if we only reduce on non-zero dims
diff --git a/onnxruntime/test/testdata/matmul_with_dynamic_input_shape.onnx b/onnxruntime/test/testdata/matmul_with_dynamic_input_shape.onnx
new file mode 100644
index 0000000000..69a5f19f2c
Binary files /dev/null and b/onnxruntime/test/testdata/matmul_with_dynamic_input_shape.onnx differ
diff --git a/onnxruntime/test/testdata/matmul_with_dynamic_input_shape.py b/onnxruntime/test/testdata/matmul_with_dynamic_input_shape.py
new file mode 100644
index 0000000000..111cc9e442
--- /dev/null
+++ b/onnxruntime/test/testdata/matmul_with_dynamic_input_shape.py
@@ -0,0 +1,38 @@
+from pathlib import Path
+
+import onnx
+from onnx import TensorProto, helper
+
+# Generates matmul_with_dynamic_input_shape.onnx next to this script.
+#
+# The model contains a single MatMul with inputs A, B and output Y where:
+# - A is a graph input with shape [M, K] and `M` is a dynamic dimension.
+# - B is an initializer with shape [K, N].
+#   - This is important for the CoreML EP which only handles the case where B is an initializer.
+
+# M is dynamic
+M = "M"
+K = 2
+N = 4
+
+matmul_node = helper.make_node("MatMul", ["A", "B"], ["Y"], "MatMul")
+
+input_a = helper.make_tensor_value_info("A", TensorProto.FLOAT, [M, K])
+output_y = helper.make_tensor_value_info("Y", TensorProto.FLOAT, [M, N])
+
+# B is filled with the values 0, 1, ..., K * N - 1
+initializer_b = helper.make_tensor("B", TensorProto.FLOAT, [K, N], [float(i) for i in range(K * N)])
+
+graph = helper.make_graph(
+    [matmul_node],
+    "MatMulWithDynamicInputShape",
+    [input_a],
+    [output_y],
+    [initializer_b],
+)
+
+opset_imports = [helper.make_operatorsetid("", 19)]
+model = helper.make_model(graph, opset_imports=opset_imports)
+
+output_path = Path(__file__).parent / "matmul_with_dynamic_input_shape.onnx"
+onnx.save(model, str(output_path))
diff --git a/onnxruntime/test/testdata/mobilenet_v3_small_excerpt.onnx b/onnxruntime/test/testdata/mobilenet_v3_small_excerpt.onnx
new file mode 100644
index 0000000000..23caad00a3
Binary files /dev/null and b/onnxruntime/test/testdata/mobilenet_v3_small_excerpt.onnx differ
diff --git a/onnxruntime/test/testdata/mobilenet_v3_small_excerpt_gen.py b/onnxruntime/test/testdata/mobilenet_v3_small_excerpt_gen.py
new file mode 100644
index 0000000000..fac80acd6d
--- /dev/null
+++ b/onnxruntime/test/testdata/mobilenet_v3_small_excerpt_gen.py
@@ -0,0 +1,129 @@
+"""
+Run this script to recreate the original onnx model.
+Example usage:
+python mobilenet_v3_small_excerpt_gen.py out_model_path.onnx
+
+The excerpt model and this script were generated from a full model by first extracting the excerpt with
+onnx.utils.extract_model [1] and then generating the python script from the excerpt model with onnx2py [2].
+
+[1]: https://github.com/onnx/onnx/blob/v1.14.0/docs/PythonAPIOverview.md#extracting-sub-model-with-inputs-outputs-tensor-names
+[2]: https://github.com/microsoft/onnxconverter-common/blob/v1.13.0/onnxconverter_common/onnx2py.py
+"""
+
+import os
+import sys
+
+import numpy as np
+import onnx
+from onnx import TensorProto, helper, numpy_helper
+
+DATA_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "mobilenet_v3_small_excerpt_gen")
+
+
+def clear_field(proto, field):
+    """Call `proto.ClearField(field)` and return the same `proto` object."""
+    proto.ClearField(field)
+    return proto
+
+
+def order_repeated_field(repeated_proto, key_name, order):
+    """Sort `repeated_proto` in place by the position of each element's `key_name` attribute within `order`."""
+    order = list(order)
+    repeated_proto.sort(key=lambda x: order.index(getattr(x, key_name)))
+
+
+def make_node(op_type, inputs, outputs, name=None, doc_string=None, domain=None, **kwargs):
+    """Wrap `helper.make_node`, keeping an explicitly empty doc_string and ordering the attributes like `kwargs`."""
+    node = helper.make_node(op_type, inputs, outputs, name, doc_string, domain, **kwargs)
+    if doc_string == "":
+        node.doc_string = ""
+    order_repeated_field(node.attribute, "name", kwargs.keys())
+    return node
+
+
+def make_graph(*args, doc_string=None, **kwargs):
+    """Wrap `helper.make_graph`, assigning an empty doc_string to the graph when one is passed explicitly."""
+    graph = helper.make_graph(*args, doc_string=doc_string, **kwargs)
+    if doc_string == "":
+        graph.doc_string = ""
+    return graph
+
+
+model = helper.make_model(
+    opset_imports=[clear_field(helper.make_operatorsetid("", 13), "domain")],
+    ir_version=6,
+    producer_name="onnx.utils.extract_model",
+    graph=make_graph(
+        name="Extracted from {torch-jit-export}",
+        inputs=[helper.make_tensor_value_info("input", TensorProto.FLOAT, shape=["batch_size", 3, 224, 224])],
+        outputs=[helper.make_tensor_value_info("254", TensorProto.FLOAT, shape=["batch_size", 16, 112, 112])],
+        initializer=[
+            numpy_helper.from_array(
+                np.load(os.path.join(DATA_DIR, "const0_535.npy")).astype("float32").reshape([16, 3, 3, 3]), name="535"
+            ),
+            numpy_helper.from_array(
+                np.load(os.path.join(DATA_DIR, "const1_536.npy")).astype("float32").reshape([16]), name="536"
+            ),
+        ],
+        value_info=[
+            helper.make_tensor_value_info("534", TensorProto.FLOAT, shape=["batch_size", 16, 112, 112]),
+            helper.make_tensor_value_info("247", TensorProto.FLOAT, shape=[]),
+            helper.make_tensor_value_info("248", TensorProto.FLOAT, shape=["batch_size", 16, 112, 112]),
+            helper.make_tensor_value_info("249", TensorProto.FLOAT, shape=[]),
+            helper.make_tensor_value_info("250", TensorProto.FLOAT, shape=[]),
+            helper.make_tensor_value_info("251", TensorProto.FLOAT, shape=["batch_size", 16, 112, 112]),
+            helper.make_tensor_value_info("252", TensorProto.FLOAT, shape=[]),
+            helper.make_tensor_value_info("253", TensorProto.FLOAT, shape=["batch_size", 16, 112, 112]),
+            helper.make_tensor_value_info("254", TensorProto.FLOAT, shape=["batch_size", 16, 112, 112]),
+        ],
+        nodes=[  # Conv_0 produces "534"; the remaining nodes compute 254 = 534 * (Clip(534 + 3.0, 0.0, 6.0) / 6.0)
+            make_node(
+                "Conv",
+                inputs=["input", "535", "536"],
+                outputs=["534"],
+                name="Conv_0",
+                dilations=[1, 1],
+                group=1,
+                kernel_shape=[3, 3],
+                pads=[1, 1, 1, 1],
+                strides=[2, 2],
+            ),
+            make_node(
+                "Constant",
+                inputs=[],
+                outputs=["247"],
+                name="Constant_1",
+                value=numpy_helper.from_array(np.array(3.0, dtype="float32"), name=""),
+            ),
+            make_node("Add", inputs=["534", "247"], outputs=["248"], name="Add_2"),
+            make_node(
+                "Constant",
+                inputs=[],
+                outputs=["249"],
+                name="Constant_3",
+                value=numpy_helper.from_array(np.array(0.0, dtype="float32"), name=""),
+            ),
+            make_node(
+                "Constant",
+                inputs=[],
+                outputs=["250"],
+                name="Constant_4",
+                value=numpy_helper.from_array(np.array(6.0, dtype="float32"), name=""),
+            ),
+            make_node("Clip", inputs=["248", "249", "250"], outputs=["251"], name="Clip_5"),
+            make_node(
+                "Constant",
+                inputs=[],
+                outputs=["252"],
+                name="Constant_6",
+                value=numpy_helper.from_array(np.array(6.0, dtype="float32"), name=""),
+            ),
+            make_node("Div", inputs=["251", "252"], outputs=["253"], name="Div_7"),
+            make_node("Mul", inputs=["534", "253"], outputs=["254"], name="Mul_8"),
+        ],
+    ),
+)
+
+if __name__ == "__main__" and len(sys.argv) == 2:  # the model is only saved when exactly one argument is given
+    _, out_path = sys.argv
+    onnx.save(model, out_path)
diff --git a/onnxruntime/test/testdata/mobilenet_v3_small_excerpt_gen/const0_535.npy b/onnxruntime/test/testdata/mobilenet_v3_small_excerpt_gen/const0_535.npy
new file mode 100644
index 0000000000..b900727402
Binary files /dev/null and b/onnxruntime/test/testdata/mobilenet_v3_small_excerpt_gen/const0_535.npy differ
diff --git a/onnxruntime/test/testdata/mobilenet_v3_small_excerpt_gen/const1_536.npy b/onnxruntime/test/testdata/mobilenet_v3_small_excerpt_gen/const1_536.npy
new file mode 100644
index 0000000000..f7c79de131
Binary files /dev/null and b/onnxruntime/test/testdata/mobilenet_v3_small_excerpt_gen/const1_536.npy differ
diff --git a/onnxruntime/test/util/include/current_test_name.h b/onnxruntime/test/util/include/current_test_name.h
new file mode 100644
index 0000000000..faecd3ecfc
--- /dev/null
+++ b/onnxruntime/test/util/include/current_test_name.h
@@ -0,0 +1,25 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <string>
+
+#include "gtest/gtest.h"
+
+namespace onnxruntime::test {
+
+// Builds "<test suite name>.<test name>" for the test that googletest reports as the current one.
+// Yields "unknown" when googletest reports no current test.
+inline std::string CurrentTestName() {
+  const testing::TestInfo* const current = testing::UnitTest::GetInstance()->current_test_info();
+  if (current != nullptr) {
+    std::string full_name{current->test_suite_name()};
+    full_name += ".";
+    full_name += current->name();
+    return full_name;
+  }
+  return "unknown";
+}
+
+}  // namespace onnxruntime::test
diff --git a/onnxruntime/test/util/include/test_utils.h b/onnxruntime/test/util/include/test_utils.h
index 2c20177cd3..83eb4f59bd 100644
--- a/onnxruntime/test/util/include/test_utils.h
+++ b/onnxruntime/test/util/include/test_utils.h
@@ -3,15 +3,17 @@
 
 #pragma once
 
-#include "core/framework/framework_common.h"
-#include "core/framework/execution_provider.h"
-#include "core/framework/ort_value.h"
-#include "core/providers/cpu/cpu_execution_provider.h"
-
 #include <memory>
+#include <string_view>
 #include <string>
 #include <vector>
 
+#include "core/common/gsl.h"
+#include "core/framework/execution_provider.h"
+#include "core/framework/framework_common.h"
+#include "core/framework/ort_value.h"
+#include "core/providers/cpu/cpu_execution_provider.h"
+
 namespace onnxruntime {
 class Graph;
 
@@ -44,14 +46,14 @@ int CountAssignedNodes(const Graph& current_graph, const std::string& ep_type);
 // Run the model using the CPU EP to get expected output, comparing to the output when the 'execution_provider'
 // is enabled. requires that at least one node is assigned to 'execution_provider'
 void RunAndVerifyOutputsWithEP(const ORTCHAR_T* model_path,
-                               const char* log_id,
+                               std::string_view log_id,
                                std::unique_ptr<IExecutionProvider> execution_provider,
                                const NameMLValMap& feeds,
                                const EPVerificationParams& params = EPVerificationParams());
 
 // A helper function that takes in model_data
 void RunAndVerifyOutputsWithEP(const std::string& model_data,
-                               const char* log_id,
+                               std::string_view log_id,
                                std::unique_ptr<IExecutionProvider> execution_provider,
                                const NameMLValMap& feeds,
                                const EPVerificationParams& params = EPVerificationParams());
@@ -66,8 +68,8 @@ void CheckShapeEquality(const ONNX_NAMESPACE::TensorShapeProto* shape1,
 
 // Create OrtValue on CPU copying from provided inputs.
 template <typename T>
-void CreateInputOrtValueOnCPU(gsl::span<const int64_t> dims, const std::vector<T>& value,
-                              OrtValue* p_ortvalue, AllocatorPtr alloc = nullptr) {
+OrtValue CreateInputOrtValueOnCPU(gsl::span<const int64_t> dims, gsl::span<const T> value,
+                                  AllocatorPtr alloc = nullptr) {
   static CPUExecutionProviderInfo info;
   static CPUExecutionProvider cpu_provider(info);
   static AllocatorPtr cpu_allocator = cpu_provider.CreatePreferredAllocators()[0];
@@ -82,9 +84,11 @@ void CreateInputOrtValueOnCPU(gsl::span<const int64_t> dims, const std::vector<T
     memcpy(p_tensor->MutableDataRaw(), value.data(), p_tensor->SizeInBytes());
   }
 
-  p_ortvalue->Init(p_tensor.release(),
-                   DataTypeImpl::GetType<Tensor>(),
-                   DataTypeImpl::GetType<Tensor>()->GetDeleteFunc());
+  OrtValue ort_value;
+  ort_value.Init(p_tensor.release(),
+                 DataTypeImpl::GetType<Tensor>(),
+                 DataTypeImpl::GetType<Tensor>()->GetDeleteFunc());
+  return ort_value;
 }
 
 }  // namespace test
diff --git a/onnxruntime/test/util/test_utils.cc b/onnxruntime/test/util/test_utils.cc
index 2f67fea9f6..1d38aea910 100644
--- a/onnxruntime/test/util/test_utils.cc
+++ b/onnxruntime/test/util/test_utils.cc
@@ -83,7 +83,7 @@ int CountAssignedNodes(const Graph& current_graph, const std::string& ep_type) {
   return count;
 }
 
-void RunAndVerifyOutputsWithEP(const ORTCHAR_T* model_path, const char* log_id,
+void RunAndVerifyOutputsWithEP(const ORTCHAR_T* model_path, std::string_view log_id,
                                std::unique_ptr<IExecutionProvider> execution_provider,
                                const NameMLValMap& feeds,
                                const EPVerificationParams& params) {
@@ -93,7 +93,7 @@ void RunAndVerifyOutputsWithEP(const ORTCHAR_T* model_path, const char* log_id,
   RunAndVerifyOutputsWithEP(model_data, log_id, std::move(execution_provider), feeds, params);
 }
 
-void RunAndVerifyOutputsWithEP(const std::string& model_data, const char* log_id,
+void RunAndVerifyOutputsWithEP(const std::string& model_data, std::string_view log_id,
                                std::unique_ptr<IExecutionProvider> execution_provider,
                                const NameMLValMap& feeds,
                                const EPVerificationParams& params) {
diff --git a/orttraining/orttraining/test/training_api/core/data_utils.h b/orttraining/orttraining/test/training_api/core/data_utils.h
index d15e3845f9..7724bc0c26 100644
--- a/orttraining/orttraining/test/training_api/core/data_utils.h
+++ b/orttraining/orttraining/test/training_api/core/data_utils.h
@@ -57,7 +57,7 @@ inline void GenerateRandomInput(gsl::span<const int64_t> dims, OrtValue& input)
   TensorShape shape(dims);
   std::vector<float> data(shape.Size());
   GenerateRandomData(data);
-  onnxruntime::test::CreateInputOrtValueOnCPU<float>(dims, data, &input);
+  input = onnxruntime::test::CreateInputOrtValueOnCPU<float>(dims, data);
 }
 
 }  // namespace onnxruntime::training::test
diff --git a/orttraining/orttraining/test/training_api/core/training_api_tests.cc b/orttraining/orttraining/test/training_api/core/training_api_tests.cc
index 8c6b8ddb2a..49b6e9ef2c 100644
--- a/orttraining/orttraining/test/training_api/core/training_api_tests.cc
+++ b/orttraining/orttraining/test/training_api/core/training_api_tests.cc
@@ -208,8 +208,8 @@ TEST(TrainingApiTest, ModuleTrainStep) {
   ASSERT_EQ(model->GetTrainingModelOutputCount(), 1);
   OrtValue input, target;
   GenerateRandomInput(std::array<int64_t, 2>{2, 784}, input);
-  onnxruntime::test::CreateInputOrtValueOnCPU<int32_t>(
-      std::array<int64_t, 1>{2}, std::vector<int32_t>(2, 1), &target);
+  target = onnxruntime::test::CreateInputOrtValueOnCPU<int32_t>(
+      std::array<int64_t, 1>{2}, std::vector<int32_t>(2, 1));
   auto data_loader = std::vector<std::vector<OrtValue>>(4, std::vector<OrtValue>{input, target});
 
   size_t step = 0;
@@ -326,8 +326,8 @@ void TestLRSchduler(const std::basic_string<ORTCHAR_T>& test_file_name,
 
     OrtValue input, target;
     GenerateRandomInput(std::array<int64_t, 2>{2, 784}, input);
-    onnxruntime::test::CreateInputOrtValueOnCPU<int32_t>(
-        std::array<int64_t, 1>{2}, std::vector<int32_t>(2, 1), &target);
+    target = onnxruntime::test::CreateInputOrtValueOnCPU<int32_t>(
+        std::array<int64_t, 1>{2}, std::vector<int32_t>(2, 1));
 
     /// Load test data for learning rate schedulers.
     auto data_uri = ORT_TSTR("testdata/test_data_generation/lr_scheduler/" + test_file_name);
@@ -452,8 +452,8 @@ TEST(TrainingApiTest, OptimStep) {
 
   OrtValue input, target;
   GenerateRandomInput(std::array<int64_t, 2>{2, 784}, input);
-  onnxruntime::test::CreateInputOrtValueOnCPU<int32_t>(
-      std::array<int64_t, 1>{2}, std::vector<int32_t>(2, 1), &target);
+  target = onnxruntime::test::CreateInputOrtValueOnCPU<int32_t>(
+      std::array<int64_t, 1>{2}, std::vector<int32_t>(2, 1));
   auto data_loader = std::vector<std::vector<OrtValue>>(4, std::vector<OrtValue>{input, target});
 
   size_t step = 0;