CoreML: Add ML Program ConvTranspose (#21416)

### Description

Add ML Program ConvTranspose
- some limitations to simplify the implementation for now
- some limitations due to flaky CoreML output

Added support for non-contiguous MLMultiArray output as we see that with some unit tests when the CPU-only flag is not set (e.g. innermost dim has min size of 16 but test output only has 8 values).
- support only one non-contiguous dim to keep it simple
- manually tested as we don't have a setup that can test objective-c code
- test code is in model.mm and can be enabled via ifdef if we need to validate any future changes

### Motivation and Context

Address operator gaps in high priority model.

---------

Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com>
2026-07-09 17:28:58 +00:00 · 2024-07-24 16:08:20 +10:00 · 2024-07-24 16:08:20 +10:00 · 2580d935cb
commit 2580d935cb
parent 6794dfd941
15 changed files with 506 additions and 160 deletions
--- a/cmake/onnxruntime_unittests.cmake
+++ b/cmake/onnxruntime_unittests.cmake
@ -679,7 +679,10 @@ if(onnxruntime_USE_RKNPU)
 endif()

 if(onnxruntime_USE_COREML)
-  list(APPEND onnxruntime_test_framework_src_patterns  ${TEST_SRC_DIR}/providers/coreml/*)
+  list(APPEND onnxruntime_test_framework_src_patterns ${TEST_SRC_DIR}/providers/coreml/*.cc)
+  if(APPLE)
+    list(APPEND onnxruntime_test_framework_src_patterns ${TEST_SRC_DIR}/providers/coreml/*.mm)
+  endif()
  list(APPEND onnxruntime_test_framework_libs onnxruntime_providers_coreml coreml_proto)
  list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_coreml coreml_proto)
  list(APPEND onnxruntime_test_providers_libs onnxruntime_providers_coreml coreml_proto)
--- a/onnxruntime/core/providers/coreml/builders/impl/convtranspose_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/convtranspose_op_builder.cc
@ -0,0 +1,218 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/providers/common.h"
+#include "core/providers/coreml/builders/helper.h"
+#include "core/providers/coreml/builders/impl/base_op_builder.h"
+#include "core/providers/coreml/builders/impl/builder_utils.h"
+#include "core/providers/coreml/builders/model_builder.h"
+#include "core/providers/coreml/builders/op_builder_factory.h"
+#include "core/providers/coreml/shape_utils.h"
+#include "core/providers/shared/utils/utils.h"
+
+using namespace CoreML::Specification;
+
+namespace onnxruntime {
+namespace coreml {
+
+// Op builder for the ONNX ConvTranspose operator.
+// Only the ML Program model format is handled; IsOpSupportedImpl rejects the node otherwise.
+// Both virtual overrides are defined out of line further down in this file.
+class ConvTransposeOpBuilder : public BaseOpBuilder {
+ private:
+  // This builder can emit ML Program operations.
+  bool SupportsMLProgram() const override { return true; }
+
+  // Returns true if `node` can be converted given the restrictions documented in the implementation.
+  bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params,
+                         const logging::Logger& logger) const override;
+
+  // Adds the CoreML operation for `node` to `model_builder`.
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
+                               const logging::Logger& logger) const override;
+};
+
+// Creates a CoreML ML Program 'conv_transpose' operation from the ONNX ConvTranspose `node` and adds it to
+// `model_builder`.
+// The body is only compiled when COREML_ENABLE_MLPROGRAM is defined; otherwise this is a no-op.
+// Always returns Status::OK().
+Status ConvTransposeOpBuilder::AddToModelBuilderImpl([[maybe_unused]] ModelBuilder& model_builder,
+                                                     [[maybe_unused]] const Node& node,
+                                                     const logging::Logger& /*logger*/) const {
+#if defined(COREML_ENABLE_MLPROGRAM)
+  using namespace CoreML::Specification::MILSpec;  // NOLINT
+  const auto inputs = node.InputDefs();
+  const auto outputs = node.OutputDefs();
+
+  NodeAttrHelper helper(node);
+
+  // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.conv.conv_transpose
+  std::unique_ptr<Operation> op = model_builder.CreateOperation(node, "conv_transpose");
+  const auto& op_type = op->type();
+
+  // data input and weight are always present. bias is the optional third input.
+  AddOperationInput(*op, "x", inputs[0]->Name());
+  AddOperationInput(*op, "weight", inputs[1]->Name());
+
+  const bool has_bias = inputs.size() > 2;
+  if (has_bias) {
+    AddOperationInput(*op, "bias", inputs[2]->Name());
+  }
+
+  // IsOpSupportedImpl has checked that the weight input has a valid shape.
+  // The first two dims of the weight are not spatial dims so are excluded from the count.
+  const auto num_spatial_dims = inputs[1]->Shape()->dim_size() - 2;
+
+  // The spec claims strides/dilations/pads are optional, but in practice they're required for at least the
+  // iOS15 target (CoreML5), so for simplicity we always add them using the ONNX defaults of all ones.
+  const std::vector<int64_t> all_ones(num_spatial_dims, 1);
+  const auto strides = helper.Get("strides", all_ones);
+  const auto dilations = helper.Get("dilations", all_ones);
+
+  AddOperationInput(*op, "strides", model_builder.AddConstant(op_type, "strides", strides));
+  AddOperationInput(*op, "dilations", model_builder.AddConstant(op_type, "dilations", dilations));
+
+  // 'groups' is only added if the ONNX node explicitly specifies the 'group' attribute.
+  if (const std::optional<int64_t> groups = helper.GetInt64("group"); groups.has_value()) {
+    AddOperationInput(*op, "groups", model_builder.AddScalarConstant(op_type, "groups", *groups));
+  }
+
+  // if we can enable output_shape, this code works. see IsOpSupportedImpl for the reason it's disabled.
+  // const auto output_shape = helper.GetInt64s("output_shape");
+  // if (output_shape) {
+  //  AddOperationInput(*op, "output_shape", model_builder.AddConstant(op_type, "output_shape", *output_shape));
+  //  // these are required despite the spec saying otherwise
+  //  AddOperationInput(*op, "pad_type", model_builder.AddScalarConstant(op_type, "pad_type", std::string("valid")));
+  //  std::vector<int64_t> pads(num_spatial_dims * 2, 0);
+  //  AddOperationInput(*op, "pad", model_builder.AddConstant(op_type, "pad", pads));
+  //} else {
+  //  AddPadTypeAndPads(*op, model_builder, op_type, helper, num_spatial_dims);
+  //}
+
+  AddPadTypeAndPads(*op, model_builder, op_type, helper, num_spatial_dims);
+
+  AddOperationOutput(*op, *outputs[0]);
+
+  model_builder.AddOperation(std::move(op));
+#endif  // defined(COREML_ENABLE_MLPROGRAM)
+
+  return Status::OK();
+}
+
+// Checks whether the ONNX ConvTranspose `node` can be converted to a CoreML ML Program 'conv_transpose' operation.
+// Returns false (after logging the reason at VERBOSE level) if any of the restrictions listed below are not met.
+bool ConvTransposeOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params,
+                                               const logging::Logger& logger) const {
+  if (!input_params.create_mlprogram) {
+    LOGS(logger, VERBOSE) << "ConvTranspose: ML Program required";
+    return false;
+  }
+
+  // ML Program
+  // - const weight until CoreML7 (iOS17)
+  //   - require constant for now as non-const would be unusual and we rely on the shape of W to be known to validate
+  //     the kernel_shape can be used
+  // - const bias
+  // - const pad
+  //   - if auto_pad is same_upper or same_lower the output[i] - (input[i] * strides[i]) must be divisible by 2
+  //     as the pads must be equally split as there's no upper/lower option in CoreML
+  //     - punting on supporting this for now
+  //   - must be symmetric for CoreML to do the right thing
+  // - const strides/dilations/groups
+  // - output_shape CoreML output is inconsistent so disabled for now
+  //
+  // NOTE: need to test with/without the COREML_FLAG_USE_CPU_ONLY flag being set to get an idea of how flaky the CoreML
+  // behavior is.
+  // Update /onnxruntime/test/util/default_providers.cc:DefaultCoreMLExecutionProvider to do so
+
+  const auto& input_defs = node.InputDefs();
+
+  std::vector<int64_t> input_shape;
+  if (!GetShape(*input_defs[0], input_shape, logger)) {
+    // requires the rank at least to be known
+    LOGS(logger, VERBOSE) << "ConvTranspose: failed to get input shape";
+    return false;
+  }
+
+  // for simplicity require weight to be constant
+  const auto& weight_arg = *input_defs[1];
+  const auto& weight_name = input_defs[1]->Name();
+  const auto* weight = input_params.graph_viewer.GetConstantInitializer(weight_name);
+  if (!weight) {
+    LOGS(logger, VERBOSE) << "ConvTranspose: weight must be constant";
+    return false;
+  }
+
+  if (input_defs.size() > 2 && !input_params.graph_viewer.GetConstantInitializer(input_defs[2]->Name())) {
+    LOGS(logger, VERBOSE) << "ConvTranspose: bias must be constant";
+    return false;
+  }
+
+  std::vector<int64_t> weight_shape;
+  if (!GetShape(weight_arg, weight_shape, logger)) {
+    // impossible as it's a constant initializer
+    LOGS(logger, VERBOSE) << "ConvTranspose: failed to get weight shape";
+    return false;
+  }
+
+  // The first two dims of the weight are not spatial dims. Without at least one spatial dim the spatial dim
+  // count used here and in AddToModelBuilderImpl would be zero or negative.
+  if (weight_shape.size() < 3) {
+    LOGS(logger, VERBOSE) << "ConvTranspose: weight must have at least one spatial dimension";
+    return false;
+  }
+
+  const int64_t num_spatial_dims = narrow<int64_t>(weight_shape.size()) - 2;
+
+  NodeAttrHelper helper(node);
+
+  // Punt on SAME_UPPER/SAME_LOWER for now.
+  // We could infer that 'same' -> 'same_upper' based on the CoreML conv spec having 'same' and 'same_lower' but
+  // need to validate that assertion.
+  // Additionally, if the pads size is equal, there's no difference between same_upper and same_lower.
+  // To do that we'd need the 'output_shape' attribute to check against.
+  // Can add this handling if/when needed.
+  const auto autopad = StringToAutoPadType(helper.Get("auto_pad", "NOTSET"));
+  if (autopad == AutoPadType::SAME_LOWER || autopad == AutoPadType::SAME_UPPER) {
+    LOGS(logger, VERBOSE) << "ConvTranspose: support for SAME_LOWER/SAME_UPPER is not implemented yet";
+    return false;
+  } else if (autopad == AutoPadType::NOTSET) {
+    // CoreML output is inconsistent between CPU_ONLY and ALL if the pads aren't all the same value.
+    // CPU matches the expected output, but other devices don't seem to (at least on macOS).
+    //
+    // 'pads' is an optional attribute in the ONNX spec and defaults to zero for every value if not present.
+    // All zeros satisfies the 'all the same value' requirement, so we only need to validate a 'pads' attribute
+    // that was actually provided. Dereferencing a missing attribute would be undefined behavior.
+    const auto onnx_pads = helper.GetInt64s("pads");
+    if (onnx_pads && !onnx_pads->empty()) {
+      const auto pad_value = onnx_pads->front();
+      if (!std::all_of(onnx_pads->begin() + 1, onnx_pads->end(),
+                       [pad_value](auto value) { return value == pad_value; })) {
+        LOGS(logger, VERBOSE) << "ConvTranspose: all pad values must be the same for CoreML to return "
+                                 "consistent results";
+        return false;
+      }
+    }
+  }
+
+  // there's no input to specify a kernel shape in CoreML.
+  // it's OK if a specified kernel_shape matches kH and kW dims of the weight input.
+  const auto kernel_shape = helper.GetInt64s("kernel_shape");
+  if (kernel_shape) {
+    // check the specified kernel shape matches the weight shape. skip the initial N and C dims in the latter.
+    // the size check guarantees weight_shape has an entry for every kernel_shape value being compared.
+    const bool valid = static_cast<int64_t>(kernel_shape->size()) == num_spatial_dims &&
+                       std::equal(kernel_shape->begin(), kernel_shape->end(), weight_shape.begin() + 2);
+
+    if (!valid) {
+      LOGS(logger, VERBOSE) << "ConvTranspose: kernel_shape attribute does not match the weight shape";
+      return false;
+    }
+  }
+
+  // In theory this can be supported, but running with COREML_FLAG_USE_CPU_ONLY produces output that doesn't match
+  // ONNX. Running without that flag produces the expected output. Madness...
+  const auto output_shape = helper.GetInt64s("output_shape");
+  if (output_shape) {
+    LOGS(logger, VERBOSE) << "ConvTranspose: output_shape is not supported as the CoreML output is inconsistent";
+    return false;
+  }
+
+  // output_padding, if specified, must be the default value of all zeros as there's no equivalent in CoreML.
+  const auto output_padding = helper.GetInt64s("output_padding");
+  if (output_padding &&
+      std::any_of(output_padding->begin(), output_padding->end(), [](auto value) { return value != 0; })) {
+    LOGS(logger, VERBOSE) << "ConvTranspose: output_padding is not supported";
+    return false;
+  }
+
+  return true;
+}
+
+// Registers a ConvTransposeOpBuilder for `op_type`.
+// `op_registrations.builders` takes ownership of the new builder and `op_registrations.op_builder_map` gets a
+// non-owning pointer to it keyed by `op_type`.
+void CreateConvTransposeOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) {
+  auto& owned_builders = op_registrations.builders;
+  owned_builders.push_back(std::make_unique<ConvTransposeOpBuilder>());
+
+  const auto* registered_builder = owned_builders.back().get();
+  op_registrations.op_builder_map.emplace(op_type, registered_builder);
+}
+
+}  // namespace coreml
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc
@ -427,13 +427,13 @@ bool ResizeOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPa
        auto h_in = input_shape[input_rank - 2];
        auto w_in = input_shape[input_rank - 1];

-        if (!utils::IsScalingByAFactorOfN(h_in, scale_h)) {
+        if (!utils::ReciprocalIsAFactorOfN(h_in, scale_h)) {
          LOGS(logger, VERBOSE) << "Resize: downsampling scale " << scale_h
                                << " is not a factor of input height: " << h_in;
          return false;
        }

-        if (!utils::IsScalingByAFactorOfN(w_in, scale_w)) {
+        if (!utils::ReciprocalIsAFactorOfN(w_in, scale_w)) {
          LOGS(logger, VERBOSE) << "Resize: downsampling scale " << scale_w
                                << " is not a factor of input width: " << w_in;
          return false;
--- a/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc
+++ b/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc
@ -15,120 +15,56 @@ namespace coreml {
 static OpBuilderRegistrations CreateOpBuilderRegistrations() {
  OpBuilderRegistrations op_registrations;

-  {  // Add/Mul/Pow/Sub/Div
-    CreateBinaryOpBuilder("Add", op_registrations);
-    CreateBinaryOpBuilder("Mul", op_registrations);
-    CreateBinaryOpBuilder("Pow", op_registrations);
-    CreateBinaryOpBuilder("Sub", op_registrations);
-    CreateBinaryOpBuilder("Div", op_registrations);
-  }
+  // Unary ops
+  CreateUnaryOpBuilder("Sqrt", op_registrations);
+  CreateUnaryOpBuilder("Reciprocal", op_registrations);

-  {  // Activations
-    CreateActivationOpBuilder("Sigmoid", op_registrations);
-    CreateActivationOpBuilder("Tanh", op_registrations);
-    CreateActivationOpBuilder("Relu", op_registrations);
-    CreateActivationOpBuilder("PRelu", op_registrations);
-    CreateActivationOpBuilder("LeakyRelu", op_registrations);
-  }
+  // Binary elementwise ops
+  CreateBinaryOpBuilder("Add", op_registrations);
+  CreateBinaryOpBuilder("Mul", op_registrations);
+  CreateBinaryOpBuilder("Pow", op_registrations);
+  CreateBinaryOpBuilder("Sub", op_registrations);
+  CreateBinaryOpBuilder("Div", op_registrations);

-  {  // Transpose
-    CreateTransposeOpBuilder("Transpose", op_registrations);
-  }
+  // Activations
+  CreateActivationOpBuilder("Sigmoid", op_registrations);
+  CreateActivationOpBuilder("Tanh", op_registrations);
+  CreateActivationOpBuilder("Relu", op_registrations);
+  CreateActivationOpBuilder("PRelu", op_registrations);
+  CreateActivationOpBuilder("LeakyRelu", op_registrations);

-  {  // Conv
-    CreateConvOpBuilder("Conv", op_registrations);
-  }
+  // Pooling ops
+  CreatePoolOpBuilder("GlobalAveragePool", op_registrations);
+  CreatePoolOpBuilder("GlobalMaxPool", op_registrations);
+  CreatePoolOpBuilder("AveragePool", op_registrations);
+  CreatePoolOpBuilder("MaxPool", op_registrations);

-  {  // Batch Normalization
-    CreateBatchNormalizationOpBuilder("BatchNormalization", op_registrations);
-  }
+  // Reduction ops
+  CreateReductionOpBuilder("ReduceMean", op_registrations);
+  CreateReductionOpBuilder("ReduceSum", op_registrations);

-  {  // Reshape
-    CreateReshapeOpBuilder("Reshape", op_registrations);
-  }
-
-  {  // DepthToSpace
-    CreateDepthToSpaceOpBuilder("DepthToSpace", op_registrations);
-  }
-
-  {  // Pool
-    CreatePoolOpBuilder("GlobalAveragePool", op_registrations);
-    CreatePoolOpBuilder("GlobalMaxPool", op_registrations);
-    CreatePoolOpBuilder("AveragePool", op_registrations);
-    CreatePoolOpBuilder("MaxPool", op_registrations);
-  }
-
-  {  // Concat
-    CreateConcatOpBuilder("Concat", op_registrations);
-  }
-
-  {  // Resize
-    CreateResizeOpBuilder("Resize", op_registrations);
-  }
-
-  {  // Gemm/MatMul
-    CreateGemmOpBuilder("Gemm", op_registrations);
-    CreateGemmOpBuilder("MatMul", op_registrations);
-  }
-
-  {  // Clip
-    CreateClipOpBuilder("Clip", op_registrations);
-  }
-
-  {  // Squeeze
-    CreateSqueezeOpBuilder("Squeeze", op_registrations);
-  }
-
-  {  // ArgMax
-    CreateArgMaxOpBuilder("ArgMax", op_registrations);
-  }
-
-  {  // Cast
-    CreateCastOpBuilder("Cast", op_registrations);
-  }
-
-  {  // Flatten
-    CreateFlattenOpBuilder("Flatten", op_registrations);
-  }
-
-  {  // LRN
-    CreateLRNOpBuilder("LRN", op_registrations);
-  }
-
-  {  // Pad
-    CreatePadOpBuilder("Pad", op_registrations);
-  }
-
-  {  // Unary
-    CreateUnaryOpBuilder("Sqrt", op_registrations);
-    CreateUnaryOpBuilder("Reciprocal", op_registrations);
-  }
-
-  {  // Reduction
-     // ReduceMean is used in layer normalization which seems to be problematic in Python tests.
-    CreateReductionOpBuilder("ReduceMean", op_registrations);
-    CreateReductionOpBuilder("ReduceSum", op_registrations);
-  }
-
-  {  // Shape
-    CreateShapeOpBuilder("Shape", op_registrations);
-  }
-
-  {  // Gather
-    CreateGatherOpBuilder("Gather", op_registrations);
-  }
-
-  {  // Slice
-    CreateSliceOpBuilder("Slice", op_registrations);
-  }
-
-  {  // Softmax
-    CreateSoftmaxOpBuilder("Softmax", op_registrations);
-  }
-
-  {  // Split
-    CreateSplitOpBuilder("Split", op_registrations);
-  }
+  CreateArgMaxOpBuilder("ArgMax", op_registrations);
+  CreateBatchNormalizationOpBuilder("BatchNormalization", op_registrations);
+  CreateCastOpBuilder("Cast", op_registrations);
+  CreateClipOpBuilder("Clip", op_registrations);
+  CreateConcatOpBuilder("Concat", op_registrations);
+  CreateConvOpBuilder("Conv", op_registrations);
+  CreateConvTransposeOpBuilder("ConvTranspose", op_registrations);
+  CreateDepthToSpaceOpBuilder("DepthToSpace", op_registrations);
+  CreateFlattenOpBuilder("Flatten", op_registrations);
+  CreateGatherOpBuilder("Gather", op_registrations);
+  CreateGemmOpBuilder("Gemm", op_registrations);
+  CreateLRNOpBuilder("LRN", op_registrations);
+  CreateGemmOpBuilder("MatMul", op_registrations);
+  CreatePadOpBuilder("Pad", op_registrations);
+  CreateReshapeOpBuilder("Reshape", op_registrations);
+  CreateResizeOpBuilder("Resize", op_registrations);
+  CreateShapeOpBuilder("Shape", op_registrations);
+  CreateSliceOpBuilder("Slice", op_registrations);
+  CreateSplitOpBuilder("Split", op_registrations);
+  CreateSoftmaxOpBuilder("Softmax", op_registrations);
+  CreateSqueezeOpBuilder("Squeeze", op_registrations);
+  CreateTransposeOpBuilder("Transpose", op_registrations);

  CreateGridSampleOpBuilder("GridSample", op_registrations);

--- a/onnxruntime/core/providers/coreml/builders/op_builder_factory.h
+++ b/onnxruntime/core/providers/coreml/builders/op_builder_factory.h
@ -24,6 +24,7 @@ void CreateCastOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_
 void CreateClipOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations);
 void CreateConcatOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations);
 void CreateConvOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations);
+void CreateConvTransposeOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations);
 void CreateDepthToSpaceOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations);
 void CreateFlattenOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations);
 void CreateGatherOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations);
--- a/onnxruntime/core/providers/coreml/model/model.h
+++ b/onnxruntime/core/providers/coreml/model/model.h
@ -13,6 +13,10 @@
 #include "core/common/status.h"
 #include "core/platform/ort_mutex.h"

+#if defined(__OBJC__)
+@class MLMultiArray;
+#endif
+
 namespace onnxruntime {
 namespace coreml {

@ -32,6 +36,15 @@ using GetOutputTensorMutableRawDataFn = std::function<void*(const std::string& n
                                                            int32_t requested_onnx_tensor_element_type,
                                                            gsl::span<const int64_t> static_shape)>;

+#if defined(__OBJC__)
+// helper function that we unit test.
+// Handles an MLMultiArray that is contiguous, or has one non-contiguous dimension.
+// The output values can be used to copy the array data to a contiguous buffer.
+// Loop num_blocks times, copying block_size elements each time, moving stride elements between copies.
+// A contiguous array will have num_blocks == 1, block_size == total_size (i.e. can be copied in a single operation)
+Status GetMLMultiArrayCopyInfo(const MLMultiArray* array, int64_t& num_blocks, int64_t& block_size, int64_t& stride);
+#endif
+
 class Model {
 public:
  Model(const std::string& path,
--- a/onnxruntime/core/providers/coreml/model/model.mm
+++ b/onnxruntime/core/providers/coreml/model/model.mm
@ -174,51 +174,69 @@ Status CreateInputFeatureProvider(const std::unordered_map<std::string, OnnxTens
  return Status::OK();
 }

-bool IsArrayContiguous(const MLMultiArray* array) {
-  int64_t batch_stride = [array.strides[0] longLongValue];
-  const auto* shape = array.shape;
-  int64_t batch_elems = 1;
-  for (unsigned long i = 1; i < shape.count; i++) batch_elems *= [shape[i] longLongValue];
-  return batch_stride == batch_elems;
-}
-
 Status CopyMLMultiArrayBuffer(const void* mlmultiarray_buffer, void* tensor_buffer,
-                              const MLMultiArray* array_info,
-                              const OnnxTensorInfo* tensor_info,
-                              const std::optional<unsigned long> mlmultiarray_buffer_size) {
+                              const MLMultiArray* array,
+                              const int64_t num_blocks, const int64_t block_size, const int64_t stride,
+                              const OnnxTensorInfo* tensor_info) {
  if (mlmultiarray_buffer == nullptr) {
    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "mlmultiarray_buffer has no data");
  }

-  const size_t num_elements = array_info.count;
+  // total including non-contiguous space
+
+  int64_t array_total_elements = [array.strides[0] longLongValue] * [array.shape[0] longLongValue];
+  const int64_t num_elements = array.count;
+
+  ORT_RETURN_IF(array_total_elements != num_blocks * stride ||
+                    num_elements != num_blocks * block_size,
+                "MLMultiArray size does not match the copy info");
+
  const auto onnx_data_type = tensor_info->data_type;
  switch (onnx_data_type) {
    case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: {
-      const auto output_data_byte_size = num_elements * sizeof(float);
-      ORT_RETURN_IF_NOT(!mlmultiarray_buffer_size || mlmultiarray_buffer_size == output_data_byte_size,
-                        "CoreML output buffer size and expected output size differ");
-      memcpy(tensor_buffer, mlmultiarray_buffer, output_data_byte_size);
+      const auto* src_buffer = static_cast<const float*>(mlmultiarray_buffer);
+      auto* dst_buffer = static_cast<float*>(tensor_buffer);
+      const auto block_byte_size = block_size * sizeof(float);
+
+      for (int64_t idx = 0; idx < num_blocks; ++idx) {
+        memcpy(dst_buffer, src_buffer, block_byte_size);
+        src_buffer += stride;
+        dst_buffer += block_size;
+      }
      break;
    }
    case ONNX_NAMESPACE::TensorProto_DataType_INT32: {
-      const auto output_data_byte_size = num_elements * sizeof(int32_t);
-      ORT_RETURN_IF_NOT(!mlmultiarray_buffer_size || mlmultiarray_buffer_size == output_data_byte_size,
-                        "CoreML output buffer size and expected output size differ");
-      memcpy(tensor_buffer, mlmultiarray_buffer, output_data_byte_size);
+      const auto* src_buffer = static_cast<const int32_t*>(mlmultiarray_buffer);
+      auto* dst_buffer = static_cast<int32_t*>(tensor_buffer);
+      const auto block_byte_size = block_size * sizeof(int32_t);
+
+      for (int64_t idx = 0; idx < num_blocks; ++idx) {
+        memcpy(dst_buffer, src_buffer, block_byte_size);
+        src_buffer += stride;
+        dst_buffer += block_size;
+      }
+
      break;
    }
    // For this case, since Coreml Spec only uses int32 for model output while onnx provides
    // int64 for model output data type. We are doing a type casting (int32 -> int64) here
    // when copying the model to ORT
    case ONNX_NAMESPACE::TensorProto_DataType_INT64: {
-      ORT_RETURN_IF_NOT(array_info.dataType == MLMultiArrayDataTypeInt32,
-                        "CoreML output data type is not MLMultiArrayDataTypeInt32");
-      ORT_RETURN_IF_NOT(!mlmultiarray_buffer_size || mlmultiarray_buffer_size == num_elements * sizeof(int32_t),
-                        "CoreML output buffer size and expected output size differ");
-      const auto model_output_span = gsl::span{static_cast<const int32_t*>(mlmultiarray_buffer), num_elements};
-      const auto output_span = gsl::span{static_cast<int64_t*>(tensor_buffer), num_elements};
-      std::transform(model_output_span.begin(), model_output_span.end(), output_span.begin(),
-                     [](int32_t v) { return static_cast<int64_t>(v); });
+      ORT_RETURN_IF(array.dataType != MLMultiArrayDataTypeInt32,
+                    "CoreML output data type is not MLMultiArrayDataTypeInt32");
+
+      const int32_t* src_buffer = static_cast<const int32_t*>(mlmultiarray_buffer);
+      int64_t* dst_buffer = static_cast<int64_t*>(tensor_buffer);
+
+      for (int64_t idx = 0; idx < num_blocks; ++idx) {
+        auto input_span = gsl::span{src_buffer, static_cast<size_t>(block_size)};
+        auto output_span = gsl::span{dst_buffer, static_cast<size_t>(block_size)};
+        std::transform(input_span.begin(), input_span.end(), output_span.begin(),
+                       [](int32_t v) { return static_cast<int64_t>(v); });
+
+        src_buffer += stride;
+        dst_buffer += block_size;
+      }
      break;
    }
    default:
@ -250,8 +268,7 @@ NS_ASSUME_NONNULL_BEGIN
 - (Status)loadModel API_AVAILABLE_COREML3;
 - (Status)predict:(const std::unordered_map<std::string, OnnxTensorData>&)inputs
                  outputs:(const std::unordered_map<std::string, OnnxTensorInfo>&)outputs
-    getOutputTensorDataFn:(const GetOutputTensorMutableRawDataFn&)
-                              get_output_tensor_mutable_raw_data_fn
+    getOutputTensorDataFn:(const GetOutputTensorMutableRawDataFn&)get_output_tensor_mutable_raw_data_fn
    API_AVAILABLE_COREML3;

@property(nullable) MLModel* model API_AVAILABLE_COREML3;
@ -397,21 +414,27 @@ NS_ASSUME_NONNULL_BEGIN
                                 ") do not match");
        }

-        ORT_RETURN_IF_NOT(IsArrayContiguous(data),
-                          "Non-contiguous output MLMultiArray is not currently supported");
+        // support a non-contiguous array, provided only one dimension is not contiguous
+        int64_t num_blocks = 0;
+        int64_t block_size = 0;
+        int64_t stride = 0;
+
+        ORT_RETURN_IF_ERROR(GetMLMultiArrayCopyInfo(data, num_blocks, block_size, stride));
+
        __block Status copy_status;
        const auto* tensor_info = &output_tensor_info;
        // `getBytesWithHandler` replaces deprecated `.dataPointer` on new versions
        if (@available(macOS 12.3, iOS 15.4, *)) {
          [data getBytesWithHandler:^(const void* bytes, NSInteger size) {
-            copy_status = CopyMLMultiArrayBuffer(bytes, output_buffer, data, tensor_info, size);
+            copy_status = CopyMLMultiArrayBuffer(bytes, output_buffer, data,
+                                                 num_blocks, block_size, stride, tensor_info);
          }];
        } else {
-          // disable size check as old API does not return buffer length
-          copy_status = CopyMLMultiArrayBuffer(data.dataPointer, output_buffer, data, tensor_info, std::nullopt);
+          copy_status = CopyMLMultiArrayBuffer(data.dataPointer, output_buffer, data,
+                                               num_blocks, block_size, stride, tensor_info);
        }
-        if (!copy_status.IsOK())
-          return copy_status;
+
+        ORT_RETURN_IF_ERROR(copy_status);
      }
    }
  }
@ -431,6 +454,49 @@ NS_ASSUME_NONNULL_END
 namespace onnxruntime {
 namespace coreml {

+// Calculates how the data in `array` can be copied to a contiguous buffer.
+// Supports an array that is contiguous, or has exactly one non-contiguous dimension.
+//
+// Outputs (always overwritten, so the caller does not need to initialize them):
+//   num_blocks: number of copies required
+//   block_size: number of elements to copy each time
+//   stride: number of elements to advance in the MLMultiArray buffer between copies
+// A contiguous array results in num_blocks == 1 and block_size == stride == total number of elements.
+// An array with no elements results in all outputs being 0 as there is nothing to copy.
+//
+// Returns a failure Status if the array has rank 0 or more than one non-contiguous dimension.
+// The ORT_ENFORCE checks at the end validate internal consistency of the calculated values.
+Status GetMLMultiArrayCopyInfo(const MLMultiArray* _Nonnull array,
+                               int64_t& num_blocks, int64_t& block_size, int64_t& stride) {
+  // don't rely on the caller having zeroed the outputs
+  num_blocks = 0;
+  block_size = 0;
+  stride = 0;
+
+  const auto* shape = array.shape;
+  const auto rank = shape.count;
+
+  // indexing strides[0]/shape[0] below is invalid for an empty shape
+  ORT_RETURN_IF(rank == 0, "MLMultiArray with rank 0 is not supported");
+
+  if (array.count == 0) {
+    // a zero-sized dimension means there's no data. return early to avoid dividing by a block_size of zero.
+    return Status::OK();
+  }
+
+  const int64_t array_total_elements = [array.strides[0] longLongValue] * [shape[0] longLongValue];
+
+  bool found_non_contiguous_dim = false;
+  int64_t data_elems = 1;   // actual values
+  int64_t total_elems = 1;  // elems including empty slots if non-contiguous
+
+  // iterate from the innermost dimension outwards.
+  // a dim is contiguous if its stride equals the total size (including any gaps) of the dims inside it.
+  for (unsigned long i = 1; i <= rank; i++) {
+    const int64_t this_stride = [array.strides[rank - i] longLongValue];
+    if (this_stride != total_elems) {
+      // non-contiguous
+      if (found_non_contiguous_dim) {
+        return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
+                               "Multiple non-contiguous dimensions in MLMultiArray are not supported.");
+      }
+
+      found_non_contiguous_dim = true;
+      block_size = data_elems;  // everything inside this dim is contiguous and can be copied as one block
+      stride = this_stride;
+    }
+
+    const auto elems_this_dim = [shape[rank - i] longLongValue];
+    data_elems *= elems_this_dim;
+    total_elems = elems_this_dim * this_stride;
+  }
+
+  if (!found_non_contiguous_dim) {
+    // all data is contiguous
+    block_size = data_elems;
+    stride = array_total_elements;
+    assert(block_size == stride);
+  }
+
+  // block_size is >= 1 here as arrays with no elements returned early
+  num_blocks = data_elems / block_size;
+
+  ORT_ENFORCE(array_total_elements == total_elems, "Logic error calculating copy info");
+  ORT_ENFORCE(stride >= block_size, "Logic error calculating copy info");
+  ORT_ENFORCE(stride * num_blocks == total_elems, "Logic error calculating copy info");
+
+  return Status::OK();
+}
+
 // Internal Execution class
 // This class will bridge Model (c++) with CoreMLExecution (objective c++)
 class Execution {
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/resize_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/resize_op_builder.cc
@ -274,8 +274,8 @@ bool ResizeOpBuilder::IsOpSupportedImpl(const GraphViewer& graph_viewer, const N
          return false;
        }

-        if (!utils::IsScalingByAFactorOfN(h_in, scale_h) ||
-            !utils::IsScalingByAFactorOfN(w_in, scale_w)) {
+        if (!utils::ReciprocalIsAFactorOfN(h_in, scale_h) ||
+            !utils::ReciprocalIsAFactorOfN(w_in, scale_w)) {
          LOGS_DEFAULT(VERBOSE) << "Input size must be evenly divisible by output size when downsampling";
          return false;
        }
--- a/onnxruntime/core/providers/utils.cc
+++ b/onnxruntime/core/providers/utils.cc
@ -24,7 +24,7 @@ common::Status OutputOptionalWithoutDataHelper(const ONNX_NAMESPACE::TypeProto&
 }
 #endif

-bool IsScalingByAFactorOfN(int64_t n, float scale) {
+bool ReciprocalIsAFactorOfN(int64_t n, float scale) {
  bool is_factor = false;
  if (scale > 0.f && scale < 1.f) {
    const double factor = 1.0 / scale;
--- a/onnxruntime/core/providers/utils.h
+++ b/onnxruntime/core/providers/utils.h
@ -19,6 +19,6 @@ common::Status OutputOptionalWithoutDataHelper(const ONNX_NAMESPACE::TypeProto&
 /// Check if the reciprocal of 'scale' is a factor of 'n'.
 ///   e.g. a scale of 0.5 is 1/2, the reciprocal is 2, and 2 is a factor of any even number.
 /// </summary>
-bool IsScalingByAFactorOfN(int64_t n, float scale);
+bool ReciprocalIsAFactorOfN(int64_t n, float scale);
 }  // namespace utils
 }  // namespace onnxruntime
--- a/onnxruntime/core/providers/xnnpack/nn/conv_transpose.cc
+++ b/onnxruntime/core/providers/xnnpack/nn/conv_transpose.cc
@ -24,7 +24,7 @@ Status ConvTranspose::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr
    const auto rank = orig_shape.NumDimensions();

    if (conv_transpose_attrs_.group > 1) {
-      // Xnnpack [G, Oc, H, W Ic/G]
+      // Xnnpack [G, Oc, H, W, Ic/G]
      // (ref: https://github.com/google/XNNPACK/blob/ecd8311c8fd3d9ab47edbc3df5f2b5de7dabe75f/test/deconvolution-operator-tester.h#L678)
      if (rank == 4) {
        // split C (dim 0) into {group, C/group}
--- a/onnxruntime/core/providers/xnnpack/tensor/resize.cc
+++ b/onnxruntime/core/providers/xnnpack/tensor/resize.cc
@ -85,8 +85,8 @@ bool Resize::IsOnnxNodeSupported(const NodeUnit& node_unit,

        float scale_h = scales[2];
        float scale_w = scales[3];
-        if (!utils::IsScalingByAFactorOfN(h_in, scale_h) ||
-            !utils::IsScalingByAFactorOfN(w_in, scale_w)) {
+        if (!utils::ReciprocalIsAFactorOfN(h_in, scale_h) ||
+            !utils::ReciprocalIsAFactorOfN(w_in, scale_w)) {
          break;
        }
      }
--- a/onnxruntime/test/providers/coreml/utils_test.mm
+++ b/onnxruntime/test/providers/coreml/utils_test.mm
@ -0,0 +1,108 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#import <CoreML/CoreML.h>
+
+#include "gtest/gtest.h"
+#include "gmock/gmock.h"
+
+#include "core/providers/coreml/model/model.h"
+#include "test/util/include/asserts.h"
+
+namespace onnxruntime {
+namespace test {
+namespace {
+auto ValidateGetInfo(MLMultiArray* array,  // test helper: checks coreml::GetMLMultiArrayCopyInfo for `array`
+                     int64_t expected_num_blocks, int64_t expected_block_size, int64_t expected_stride,
+                     bool expect_valid) {  // false: only a non-OK status is asserted; expected_* are not compared
+  int64_t num_blocks = 0;  // these three locals are passed to GetMLMultiArrayCopyInfo and compared afterwards
+  int64_t block_size = 0;
+  int64_t stride = 0;
+  auto status = coreml::GetMLMultiArrayCopyInfo(array, num_blocks, block_size, stride);
+
+  if (!expect_valid) {  // failure case: the call must report an error, nothing else is inspected
+    ASSERT_STATUS_NOT_OK(status);
+    return;
+  }
+
+  ASSERT_STATUS_OK(status);  // success case: status must be OK and all three values must match
+  ASSERT_EQ(num_blocks, expected_num_blocks);
+  ASSERT_EQ(block_size, expected_block_size);
+  ASSERT_EQ(stride, expected_stride);
+}
+}  // namespace
+
+TEST(CoreMLUtils, GetMLMultiArrayReadInfo) {  // exercises coreml::GetMLMultiArrayCopyInfo via ValidateGetInfo
+  // Fake pointer. We don't read any data, but initWithDataPointer requires a non-null address.
+  void* data = reinterpret_cast<void*>(0xfeedf00d);
+
+  // A dim is non-contiguous if its stride is > the total number of elements in its inner dimensions.
+
+  // dim -1 with non-contiguous data. 1 element (as it's the inner-most dimension) but the stride is 2.
+  {
+    NSArray<NSNumber*>* shape = @[ @1, @1, @8, @8 ];
+    NSArray<NSNumber*>* strides = @[ @128, @128, @16, @2 ];
+
+    auto* array = [[MLMultiArray alloc] initWithDataPointer:data
+                                                      shape:shape
+                                                   dataType:MLMultiArrayDataTypeInt32
+                                                    strides:strides
+                                                deallocator:^(void* /* bytes */) {  // no-op: `data` is a fake pointer
+                                                }
+                                                      error:nil];
+    ValidateGetInfo(array, 64, 1, 2, true);  // expect 64 blocks of 1 element each, stride 2
+  }
+
+  // dim -2 with non-contiguous data. 8 elements in the inner dimension but the stride is 16.
+  {
+    NSArray<NSNumber*>* shape = @[ @1, @1, @8, @8 ];
+    NSArray<NSNumber*>* strides = @[ @128, @128, @16, @1 ];
+
+    auto* array = [[MLMultiArray alloc] initWithDataPointer:data
+                                                      shape:shape
+                                                   dataType:MLMultiArrayDataTypeInt32
+                                                    strides:strides
+                                                deallocator:^(void* /* bytes */) {
+                                                }
+                                                      error:nil];
+    ValidateGetInfo(array, 8, 8, 16, true);  // expect 8 blocks of 8 elements each, stride 16
+  }
+
+  // dim -3 with non-contiguous data. 16 elements in the inner dimensions but stride is 24.
+  {
+    NSArray<NSNumber*>* shape = @[ @1, @2, @4, @4 ];
+    NSArray<NSNumber*>* strides = @[ @48, @24, @4, @1 ];
+
+    auto* array = [[MLMultiArray alloc] initWithDataPointer:data
+                                                      shape:shape
+                                                   dataType:MLMultiArrayDataTypeInt32
+                                                    strides:strides
+                                                deallocator:^(void* /* bytes */) {
+                                                }
+                                                      error:nil];
+
+    ValidateGetInfo(array, 2, 16, 24, true);  // expect 2 blocks of 16 elements each, stride 24
+  }
+
+  // two non-contiguous dims (dim -2 and dim -3)
+  // dim -2 has 4 elements in the inner dimension and stride of 8
+  // dim -3 has 32 elements in the inner dimensions (we need to include the empty elements from the non-contiguous data
+  // in dim -2) and stride of 48
+  {
+    // same shape as the previous case; here the strides for both dim -2 (8) and dim -3 (48) are padded
+    NSArray<NSNumber*>* shape = @[ @1, @2, @4, @4 ];
+    NSArray<NSNumber*>* strides = @[ @96, @48, @8, @1 ];
+
+    auto* array = [[MLMultiArray alloc] initWithDataPointer:data
+                                                      shape:shape
+                                                   dataType:MLMultiArrayDataTypeInt32
+                                                    strides:strides
+                                                deallocator:^(void* /* bytes */) {
+                                                }
+                                                      error:nil];
+
+    ValidateGetInfo(array, 0, 0, 0, false);  // expect a non-OK status; the zeros are placeholders and not compared
+  }
+}
+}  // namespace test
+}  // namespace onnxruntime
--- a/onnxruntime/test/providers/cpu/nn/conv_transpose_op_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/conv_transpose_op_test.cc
@ -27,7 +27,7 @@ void TestConvTransposeOpInitializer(const ConvTransposeOpAttributes& attributes,
                                    const vector<vector<int64_t>>& input_shapes,
                                    const std::initializer_list<float>& expected_output,
                                    const vector<int64_t>& expected_output_shape,
-                                    bool is_filter_initializer = false,
+                                    bool is_weight_and_bias_initializer = false,
                                    OpTester::ExpectResult expect_result = OpTester::ExpectResult::kExpectSuccess,
                                    const std::string& err_str = "",
                                    const std::unordered_set<std::string>& excluded_provider_types = {kTensorrtExecutionProvider}) {
@ -58,10 +58,10 @@ void TestConvTransposeOpInitializer(const ConvTransposeOpAttributes& attributes,
  }

  ORT_ENFORCE(inputs.size() <= 3, "Our name array is only setup to handle 3 inputs");
-  const char* szNames[] = {"X", "W", "B"};
-  bool isInitializers[] = {false, is_filter_initializer, false};
+  const char* input_names[] = {"X", "W", "B"};
+  bool is_initializers[] = {false, is_weight_and_bias_initializer, is_weight_and_bias_initializer};
  for (size_t i = 0; i < inputs.size(); i++) {
-    test.AddInput<float>(szNames[i], input_shapes[i], inputs[i], isInitializers[i]);
+    test.AddInput<float>(input_names[i], input_shapes[i], inputs[i], is_initializers[i]);
  }
  test.AddOutput<float>("Y", expected_output_shape, expected_output);

--- a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md
+++ b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md
@ -7,6 +7,7 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution
 |ai.onnx:AveragePool|Only 2D Pool is supported currently. 3D and 5D support can be added if needed.|
 |ai.onnx:Clip||
 |ai.onnx:Conv|Only 1D/2D Conv is supported.<br/>Bias if provided must be constant.|
+|ai.onnx:ConvTranspose|Weight and bias must be constant.<br/>padding_type of SAME_UPPER/SAME_LOWER is not supported.<br/>kernel_shape must have default values.<br/>output_shape is not supported.<br/>output_padding must have default values.|
 |ai.onnx:Div||
 |ai.onnx:Gemm|Input B must be constant.|
 |ai.onnx:GlobalAveragePool|Only 2D Pool is supported currently. 3D and 5D support can be added if needed.|