Cast Op performance fix. (#6509)

Update CPU Cast implementation to fix performance regressions. Update Cast unit tests for more coverage.
2026-07-13 18:08:13 +00:00 · 2021-02-04 14:52:37 -08:00 · 2021-02-04 14:52:37 -08:00 · 318b82ca7e
commit 318b82ca7e
parent 2ef792ae6e
9 changed files with 348 additions and 396 deletions
--- a/include/onnxruntime/core/framework/data_types.h
+++ b/include/onnxruntime/core/framework/data_types.h
@@ -59,50 +59,10 @@ struct MLFloat16 {
  explicit MLFloat16(uint16_t x) : val(x) {}
  explicit MLFloat16(float f);

-  // Taken from https://stackoverflow.com/a/60047308/12627730
-  float AsFloat(uint32_t x) const {
-    float out = 0.0f;
-    std::memcpy(&out, &x, sizeof(x));
-    return out;
-  }
-
-  // Taken from https://stackoverflow.com/a/60047308/12627730
-  uint32_t AsUint(float x) const {
-    uint32_t out = 0;
-    std::memcpy(&out, &x, sizeof(x));
-    return out;
-  }
-
-  float HalfToFloat(const uint16_t x) const {
-    uint16_t half = x;
-    if (endian::native == endian::big) {
-      // Taken from https://stackoverflow.com/a/2182184/12627730
-      half = (x >> 8) | (x << 8);
-    }
-
-    // Taken from https://stackoverflow.com/a/60047308/12627730
-    // IEEE-754 16-bit floating-point format (without infinity): 1-5-10, exp-15, +-131008.0, +-6.1035156E-5,
-    // +-5.9604645E-8, 3.311 digits
-    const uint32_t e = (half & 0x7C00) >> 10;  // exponent
-    const uint32_t m = (half & 0x03FF) << 13;  // mantissa
-    // evil log2 bit hack to count leading zeros in denormalized format
-    const uint32_t v = AsUint(static_cast<float>(m)) >> 23;
-    uint32_t full = (half & 0x8000) << 16 | (e != 0) * ((e + 112) << 23 | m) |
-                    ((e == 0) & (m != 0)) * ((v - 37) << 23 | ((m << (150 - v)) & 0x007FE000));  // sign : normalized : denormalized
-
-    if (endian::native == endian::big) {
-      // Taken from https://stackoverflow.com/a/2182184/12627730
-      full = ((full >> 24) & 0xff) |       // move byte 3 to byte 0
-             ((full << 8) & 0xff0000) |    // move byte 1 to byte 2
-             ((full >> 8) & 0xff00) |      // move byte 2 to byte 1
-             ((full << 24) & 0xff000000);  // byte 0 to byte 3
-    }
-
-    return AsFloat(full);
-  }
+  float ToFloat() const;

  operator float() const {
-    return HalfToFloat(val);
+    return ToFloat();
  }
 };

--- a/include/onnxruntime/core/platform/threadpool.h
+++ b/include/onnxruntime/core/platform/threadpool.h
@@ -281,7 +281,7 @@ class ThreadPool {
  /**
   * Tries to call the given function in parallel, with calls split into (num_batches) batches.
   *\param num_batches If it is zero, it will be replaced to the value of DegreeOfParallelism().
-   *\param fn A std::function or STL style functor with signature of "void f(int32_t);"
+   *\param fn A std::function or STL style functor with signature of "void f(std::ptrdiff_t);"
   * Pitfall: Caller should cap `num_batches` to a reasonable value based on the cost of `fn` and the value of `total`.
   *For example, if fn is as simple as: int sum=0; fn = [&](int i){sum +=i;} and `total` is 100, then num_batches should
   *be just 1.
--- a/onnxruntime/core/framework/data_types.cc
+++ b/onnxruntime/core/framework/data_types.cc
@@ -25,6 +25,10 @@ namespace onnxruntime {

 MLFloat16::MLFloat16(float f) : val{math::floatToHalf(f)} {}

+float MLFloat16::ToFloat() const {
+  return math::halfToFloat(val);
+}
+
 // Return the MLDataType used for a generic Tensor
 template <>
 MLDataType DataTypeImpl::GetType<Tensor>() {
--- a/onnxruntime/core/optimizer/conv_activation_fusion.cc
+++ b/onnxruntime/core/optimizer/conv_activation_fusion.cc
@@ -49,7 +49,7 @@ static bool GetClipConstantMinMax(const Graph& graph, const Node& node, float& m
          //  value = static_cast<float>(*i.data<double>());
          //  break;
          case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16:
-            value = math::halfToFloat(i.data<BFloat16>()->val);
+            value = math::halfToFloat(i.data<MLFloat16>()->val);
            break;
          default:
            ORT_THROW("Unexpected data type for Clip input of ", initializer->data_type());
--- a/onnxruntime/core/providers/cpu/tensor/cast_op.cc
+++ b/onnxruntime/core/providers/cpu/tensor/cast_op.cc
@@ -2,8 +2,8 @@
 // Licensed under the MIT License.

 #include <cstddef>
-#include <iomanip>
-#include <sstream>
+#include <cstdio>
+#include <string>

 #include "boost/mp11.hpp"

@@ -18,15 +18,13 @@
 #include "core/providers/op_kernel_type_control.h"
 #include "core/util/math_cpuonly.h"

+#include "Eigen/src/Core/arch/Default/BFloat16.h"
 #include "Eigen/src/Core/arch/Default/Half.h"

 #if defined(_M_AMD64)
 #include "core/mlas/inc/mlas.h"
 #endif

-using namespace ONNX_NAMESPACE;
-using namespace boost::mp11;
-
 namespace onnxruntime {

 namespace op_kernel_type_control {
@@ -56,20 +54,15 @@ using EnabledSrcTypes = ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST_ALL_OPSETS(kCpuExecu
 using EnabledDstTypes = ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST_ALL_OPSETS(kCpuExecutionProvider, kOnnxDomain,
                                                                       Cast, Output, 0);

-using IndirectCastTypes = TypeList<MLFloat16, BFloat16>;
-
-template <typename Type>
-using IsDirectCastType = mp_not<mp_contains<IndirectCastTypes, Type>>;
-
-template <typename... Types>
-using AreAllDirectCastTypes = mp_all<IsDirectCastType<Types>...>;
-
 // string cast helpers
+// Note: when C++17 is available, use <charconv> functions

-// handle floating point input separately
+// handle floating point input values separately: they need NaN/INF spelling and a fixed output precision
 template <typename SrcType>
 typename std::enable_if<std::is_floating_point<SrcType>::value, void>::type
 CastToString(const SrcType& input, std::string& output) {
+  static_assert(sizeof(SrcType) <= sizeof(double),
+                "largest supported floating point type is double");
  if (std::isnan(input)) {
    output = "NaN";
  } else if (std::isinf(input)) {
@@ -79,19 +72,49 @@ CastToString(const SrcType& input, std::string& output) {
      output = "INF";
    }
  } else {
-    // setprecision to 8 to match numpy default behavior
-    std::ostringstream convert;
-    convert << std::setprecision(8) << input;
-    output = convert.str();
+    // set precision to 8 to match numpy default behavior
+    constexpr const char* format = "%.8g";
+    const double value = static_cast<double>(input);
+
+    char static_buffer[256];
+    std::unique_ptr<char[]> dynamic_buffer{};
+
+    gsl::span<char> buffer_span = gsl::make_span(static_buffer);
+
+    auto snprintf_result = std::snprintf(buffer_span.data(), buffer_span.size(), format, value);
+    ORT_ENFORCE(snprintf_result > 0, "snprintf() failed with return value: ", snprintf_result);
+
+    // include trailing '\0'
+    const size_t required_buffer_size = gsl::narrow_cast<size_t>(snprintf_result) + 1;
+
+    if (required_buffer_size > buffer_span.size()) {
+      // didn't get it all, allocate a bigger buffer and retry
+      dynamic_buffer = onnxruntime::make_unique<char[]>(required_buffer_size);
+      buffer_span = gsl::make_span(dynamic_buffer.get(), required_buffer_size);
+      snprintf_result = std::snprintf(buffer_span.data(), buffer_span.size(), format, value);
+      ORT_ENFORCE(
+          snprintf_result > 0 &&
+              gsl::narrow_cast<size_t>(snprintf_result) == buffer_span.size() - 1,
+          "Failed to write value with snprintf().");
+    }
+
+    output.assign(buffer_span.data(), required_buffer_size - 1);
  }
 }

 template <typename SrcType>
 typename std::enable_if<!std::is_floating_point<SrcType>::value, void>::type
 CastToString(const SrcType& input, std::string& output) {
-  std::ostringstream convert;
-  convert << input;
-  output = convert.str();
+  output = std::to_string(input);
+}
+
+// overloads for MLFloat16 and BFloat16
+void CastToString(const MLFloat16& input, std::string& output) {
+  CastToString(static_cast<float>(input), output);
+}
+
+void CastToString(const BFloat16& input, std::string& output) {
+  CastToString(static_cast<float>(input), output);
 }

 template <typename DstType>
@@ -118,115 +141,121 @@ CastFromString(const std::string& input, DstType& output) {
  output = gsl::narrow_cast<DstType>(std::stoll(input));
 }

-// generic scalar X -> Y
-template <typename SrcType, typename DstType>
-struct ScalarDirectCaster {
-  void Cast(const SrcType& in, DstType& out) const {
-    out = static_cast<DstType>(in);
-  }
+// overloads for MLFloat16 and BFloat16
+void CastFromString(const std::string& input, MLFloat16& output) {
+  float intermediate;
+  CastFromString(input, intermediate);
+  output = static_cast<MLFloat16>(intermediate);
+}
+
+void CastFromString(const std::string& input, BFloat16& output) {
+  float intermediate;
+  CastFromString(input, intermediate);
+  output = static_cast<BFloat16>(intermediate);
+}
+
+// type that is usable with Eigen cast
+template <typename T>
+struct EigenCastType {
+  using type = T;
 };

-// scalar X -> string
-template <typename SrcType>
-struct ScalarDirectCaster<SrcType, std::string> {
-  void Cast(const SrcType& in, std::string& out) const {
-    CastToString<SrcType>(in, out);
-  }
+// ORT float16 types don't support casting, so map them to Eigen ones
+
+template <>
+struct EigenCastType<MLFloat16> {
+  using type = Eigen::half;
 };

-// scalar string -> X
-template <typename DstType>
-struct ScalarDirectCaster<std::string, DstType> {
-  void Cast(const std::string& in, DstType& out) const {
-    CastFromString<DstType>(in, out);
-  }
-};
-
-// helper for indirect cast types
-template <typename SrcType, typename DstType, typename IntermediateType>
-struct ScalarIndirectCaster {
-  void Cast(const SrcType& in, DstType& out) const {
-    IntermediateType intermediate;
-    ScalarDirectCaster<SrcType, IntermediateType>{}.Cast(in, intermediate);
-    ScalarDirectCaster<IntermediateType, DstType>{}.Cast(intermediate, out);
-  }
-};
-
-template <typename SrcType, typename DstType, class Enable = void>
-struct ScalarCaster;
-
-template <typename SrcType, typename DstType>
-struct ScalarCaster<
-    SrcType, DstType,
-    typename std::enable_if<AreAllDirectCastTypes<SrcType, DstType>::value>::type> {
-  void Cast(const SrcType& in, DstType& out) const {
-    ScalarDirectCaster<SrcType, DstType>{}.Cast(in, out);
-  }
-};
-
-template <typename SrcType, typename DstType>
-struct ScalarCaster<
-    SrcType, DstType,
-    typename std::enable_if<!AreAllDirectCastTypes<SrcType, DstType>::value>::type> {
-  void Cast(const SrcType& in, DstType& out) const {
-    ScalarIndirectCaster<SrcType, DstType, float>{}.Cast(in, out);
-  }
+template <>
+struct EigenCastType<BFloat16> {
+  using type = Eigen::bfloat16;
 };

 // generic tensor X -> Y
-template <typename SrcType, typename DstType>
+template <typename SrcType, typename DstType, typename Enable = void>
 struct TensorCaster {
-  void Cast(const Tensor& in, Tensor& out, const TensorShape& shape) const {
+  void Cast(const OpKernelContext&, const Tensor& in, Tensor& out, const TensorShape& shape) const {
+    using SrcEigenCastType = typename EigenCastType<SrcType>::type;
+    using DstEigenCastType = typename EigenCastType<DstType>::type;
+
    const std::ptrdiff_t shape_size = gsl::narrow<std::ptrdiff_t>(shape.Size());
-    const auto in_vector = ConstEigenVectorMap<SrcType>(in.Data<SrcType>(), shape_size);
-    auto out_vector = EigenVectorMap<DstType>(out.MutableData<DstType>(), shape_size);
-    out_vector = in_vector.unaryExpr([](const SrcType& in_scalar) {
-      DstType out_scalar;
-      ScalarCaster<SrcType, DstType>{}.Cast(in_scalar, out_scalar);
-      return out_scalar;
-    });
+    const auto in_vector =
+        ConstEigenVectorMap<SrcEigenCastType>(reinterpret_cast<const SrcEigenCastType*>(in.Data<SrcType>()), shape_size);
+    auto out_vector =
+        EigenVectorMap<DstEigenCastType>(reinterpret_cast<DstEigenCastType*>(out.MutableData<DstType>()), shape_size);
+    out_vector = in_vector.template cast<DstEigenCastType>();
  }
 };

-template <typename SrcType, typename DstType>
-void CastStringTensor(const Tensor& in, Tensor& out, const TensorShape& shape) {
-  static_assert(std::is_same<SrcType, std::string>::value || std::is_same<DstType, std::string>::value,
-                "Either SrcType or DstType must be std::string.");
-  const std::ptrdiff_t shape_size = gsl::narrow<std::ptrdiff_t>(shape.Size());
-  const auto in_data = in.DataAsSpan<SrcType>();
-  const auto out_data = out.MutableDataAsSpan<DstType>();
-  for (std::ptrdiff_t i = 0; i < shape_size; ++i) {
-    ScalarCaster<SrcType, DstType>{}.Cast(in_data[i], out_data[i]);
-  }
-}
-
 // tensor X -> string
 template <typename SrcType>
 struct TensorCaster<SrcType, std::string> {
-  void Cast(const Tensor& in, Tensor& out, const TensorShape& shape) const {
-    CastStringTensor<SrcType, std::string>(in, out, shape);
+  void Cast(const OpKernelContext&, const Tensor& in, Tensor& out, const TensorShape& shape) const {
+    const std::ptrdiff_t shape_size = gsl::narrow<std::ptrdiff_t>(shape.Size());
+    const auto* in_data = in.Data<SrcType>();
+    auto* out_data = out.MutableData<std::string>();
+    for (std::ptrdiff_t i = 0; i < shape_size; ++i) {
+      CastToString(in_data[i], out_data[i]);
+    }
  }
 };

 // tensor string -> X
 template <typename DstType>
 struct TensorCaster<std::string, DstType> {
-  void Cast(const Tensor& in, Tensor& out, const TensorShape& shape) const {
-    CastStringTensor<std::string, DstType>(in, out, shape);
+  void Cast(const OpKernelContext&, const Tensor& in, Tensor& out, const TensorShape& shape) const {
+    const std::ptrdiff_t shape_size = gsl::narrow<std::ptrdiff_t>(shape.Size());
+    const auto* in_data = in.Data<std::string>();
+    auto* out_data = out.MutableData<DstType>();
+    for (std::ptrdiff_t i = 0; i < shape_size; ++i) {
+      CastFromString(in_data[i], out_data[i]);
+    }
  }
 };

 #if defined(_M_AMD64)
+// specializations to use optimized and Windows x64-specific
+// MlasConvertHalfToFloatBuffer() routine for MLFloat16 -> float conversion
+
+template <typename DstType>
+void CastMLFloat16ThroughFloat(
+    const OpKernelContext& context, const Tensor& in, Tensor& out, const TensorShape& shape) {
+  // use optimized MLFloat16 -> float, then float -> DstType
+  AllocatorPtr allocator;
+  ORT_THROW_IF_ERROR(context.GetTempSpaceAllocator(&allocator));
+  auto intermediate_buffer = IAllocator::MakeUniquePtr<float>(allocator, gsl::narrow<size_t>(shape.Size()));
+  Tensor intermediate_tensor{DataTypeImpl::GetType<float>(), shape, intermediate_buffer.get(), allocator->Info()};
+  TensorCaster<MLFloat16, float>{}.Cast(context, in, intermediate_tensor, shape);
+  TensorCaster<float, DstType>{}.Cast(context, intermediate_tensor, out, shape);
+}
+
+// tensor MLFloat16 -> X
+template <typename DstType>
+struct TensorCaster<MLFloat16, DstType> {
+  void Cast(const OpKernelContext& context, const Tensor& in, Tensor& out, const TensorShape& shape) const {
+    CastMLFloat16ThroughFloat<DstType>(context, in, out, shape);
+  }
+};
+
 // tensor MLFloat16 -> float
 template <>
 struct TensorCaster<MLFloat16, float> {
-  void Cast(const Tensor& in, Tensor& out, const TensorShape& shape) const {
+  void Cast(const OpKernelContext&, const Tensor& in, Tensor& out, const TensorShape& shape) const {
    auto out_data = out.MutableData<float>();
    auto in_data = in.Data<MLFloat16>();
    const size_t shape_size = gsl::narrow<size_t>(shape.Size());
    MlasConvertHalfToFloatBuffer(&in_data[0].val, out_data, shape_size);
  }
 };
+
+// tensor MLFloat16 -> string
+template <>
+struct TensorCaster<MLFloat16, std::string> {
+  void Cast(const OpKernelContext& context, const Tensor& in, Tensor& out, const TensorShape& shape) const {
+    CastMLFloat16ThroughFloat<std::string>(context, in, out, shape);
+  }
+};
 #endif

 class Cast final : public OpKernel {
@@ -246,17 +275,18 @@ class Cast final : public OpKernel {

 template <typename TSrc, typename TDst>
 struct Dispatcher {
-  void operator()(const Tensor& src, Tensor& dst, const TensorShape& shape) {
-    TensorCaster<TSrc, TDst>{}.Cast(src, dst, shape);
+  void operator()(const OpKernelContext& context, const Tensor& src, Tensor& dst, const TensorShape& shape) {
+    TensorCaster<TSrc, TDst>{}.Cast(context, src, dst, shape);
  }
 };

 template <typename TSrc>
 struct SrcDispatcher {
-  void operator()(int32_t to, const Tensor& src, Tensor& dst, const TensorShape& shape) {
-    using DstTypes = mp_remove_if_q<EnabledDstTypes, mp_bind_front<std::is_same, TSrc>>;
+  void operator()(
+      int32_t to, const OpKernelContext& context, const Tensor& src, Tensor& dst, const TensorShape& shape) {
+    using DstTypes = boost::mp11::mp_remove_if_q<EnabledDstTypes, boost::mp11::mp_bind_front<std::is_same, TSrc>>;
    utils::MLTypeCallDispatcherFromTypeList<DstTypes> dispatcher{to};
-    dispatcher.template InvokeWithLeadingTemplateArgs<Dispatcher, TypeList<TSrc>>(src, dst, shape);
+    dispatcher.template InvokeWithLeadingTemplateArgs<Dispatcher, TypeList<TSrc>>(context, src, dst, shape);
  }
 };

@@ -278,7 +308,7 @@ Status Cast::Compute(OpKernelContext* context) const {
  }

  utils::MLTypeCallDispatcherFromTypeList<EnabledSrcTypes> dispatcher{from};
-  dispatcher.Invoke<SrcDispatcher>(to_, *X, *Y, shape);
+  dispatcher.Invoke<SrcDispatcher>(to_, *context, *X, *Y, shape);

  return Status::OK();
 }
--- a/onnxruntime/core/providers/op_kernel_type_control.h
+++ b/onnxruntime/core/providers/op_kernel_type_control.h
@@ -273,20 +273,25 @@ struct EnabledTypes {
 *
 * In MyProvider provider's implementation of MyOp kernel:
 *
+ * namespace onnxruntime {
+ * namespace op_kernel_type_control {
 * // specify supported types, i.e., the full set of types that can be enabled
 * ORT_SPECIFY_OP_KERNEL_ARG_SUPPORTED_TYPES(
 *     MyProvider, DomainContainingMyOp, MyOp, Input, 0,
 *     int, float, double);
+ * }  // namespace op_kernel_type_control
+ * }  // namespace onnxruntime
+ *
+ * // ...
 *
 * // get enabled types
 * using MyOpFirstInputEnabledTypes =
- *     ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST(MyProvider, DomainContainingMyOp, MyOp, Input, 0)
+ *     ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST(MyProvider, DomainContainingMyOp, MyOp, Input, 0);
 *
- * ...
+ * // ...
 *
- * // in the implementation, we can dispatch to the enabled types
- * utils::MLTypeCallDispatcherFromTypeList<MyOpFirstInputEnabledTypes> dispatcher{firstInputRuntimeType};
- * ...
+ * // use MLTypeCallDispatcher to dispatch to implementations for enabled types
+ * using Dispatcher = onnxruntime::utils::MLTypeCallDispatcherFromTypeList<MyOpFirstInputEnabledTypes>;
 */

 // all allowed type specifications should be contained in the following file
--- a/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc
@@ -0,0 +1,188 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include <type_traits>
+
+#include "boost/mp11.hpp"
+
+#include "gsl/gsl"
+
+#include "gtest/gtest.h"
+
+#include "core/framework/data_types_internal.h"
+
+#include "test/common/cuda_op_test_utils.h"
+#include "test/providers/provider_test_utils.h"
+
+namespace onnxruntime {
+namespace test {
+
+template <typename T>
+int GetMinRequiredCudaComputeCapability() {
+  return 0;
+}
+
+template <>
+int GetMinRequiredCudaComputeCapability<MLFloat16>() {
+  return 530;
+}
+
+template <>
+int GetMinRequiredCudaComputeCapability<BFloat16>() {
+  return 800;
+}
+
+template <typename SrcType,
+          typename DstType>
+void TestCastOp(gsl::span<const SrcType> input,
+                gsl::span<const DstType> output,
+                const std::vector<int64_t>& dimensions,
+                OpTester::ExpectResult expect_result = OpTester::ExpectResult::kExpectSuccess,
+                const std::string& expected_failure_string = "") {
+  OpTester test("Cast", 13);
+  test.AddAttribute<int64_t>("to", utils::ToTensorProtoElementType<DstType>());
+  test.AddInput<SrcType>("input", dimensions, input.data(), input.size());
+  test.AddOutput<DstType>("output", dimensions, output.data(), output.size());
+
+  std::unordered_set<std::string> excluded_provider_types{kTensorrtExecutionProvider};
+  const auto min_required_cuda_compute_capability =
+      std::max(GetMinRequiredCudaComputeCapability<SrcType>(), GetMinRequiredCudaComputeCapability<DstType>());
+  if (!HasCudaEnvironment(min_required_cuda_compute_capability)) {
+    excluded_provider_types.insert(kCudaExecutionProvider);
+  }
+
+  test.Run(expect_result, expected_failure_string, excluded_provider_types);
+}
+
+template <typename T>
+using RequiresCastThroughFloat =
+    boost::mp11::mp_any<
+        std::is_same<T, MLFloat16>,
+        std::is_same<T, BFloat16>>;
+
+template <typename... T>
+using AnyRequireCastThroughFloat = boost::mp11::mp_any<RequiresCastThroughFloat<T>...>;
+
+template <typename SrcType, typename DstType>
+typename std::enable_if<AnyRequireCastThroughFloat<SrcType, DstType>::value>::type
+CastSpan(gsl::span<const SrcType> src, gsl::span<DstType> dst) {
+  std::transform(
+      src.begin(), src.end(), dst.begin(),
+      [](SrcType s) {
+        return static_cast<DstType>(static_cast<float>(s));
+      });
+}
+
+template <typename SrcType, typename DstType>
+typename std::enable_if<!AnyRequireCastThroughFloat<SrcType, DstType>::value>::type
+CastSpan(gsl::span<const SrcType> src, gsl::span<DstType> dst) {
+  std::transform(
+      src.begin(), src.end(), dst.begin(),
+      [](SrcType s) {
+        return static_cast<DstType>(s);
+      });
+}
+
+template <typename SrcType, typename DstType>
+std::vector<DstType> CastedValues(gsl::span<const SrcType> src) {
+  std::vector<DstType> result(src.size());
+  CastSpan<SrcType, DstType>(src, gsl::make_span(result));
+  return result;
+}
+
+struct CastNonStringTester {
+  template <typename SrcType, typename DstType>
+  void operator()(const std::pair<SrcType, DstType>&) {
+    SCOPED_TRACE(
+        onnxruntime::MakeString(
+            "Cast from type ", utils::ToTensorProtoElementType<SrcType>(),
+            " to type ", utils::ToTensorProtoElementType<DstType>()));
+
+    const std::vector<int> input_int_values{
+        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+    const TensorShape shape{2, 3, 2, 2};
+    const size_t size = gsl::narrow<size_t>(shape.Size());
+    ASSERT_EQ(input_int_values.size(), size);
+
+    auto input_buffer = onnxruntime::make_unique<SrcType[]>(size);
+    auto input_span = gsl::make_span<SrcType>(input_buffer.get(), size);
+    CastSpan<int, SrcType>(gsl::make_span(input_int_values), input_span);
+
+    auto output_buffer = onnxruntime::make_unique<DstType[]>(size);
+    auto output_span = gsl::make_span<DstType>(output_buffer.get(), size);
+    CastSpan<SrcType, DstType>(input_span, output_span);
+
+    TestCastOp<SrcType, DstType>(input_span, output_span, shape.GetDims());
+  }
+};
+
+using CastNonStringTypes =
+    boost::mp11::mp_list<
+        bool,
+        float, double,
+        uint8_t, uint16_t, uint32_t, uint64_t,
+        int8_t, int16_t, int32_t, int64_t,
+        MLFloat16, BFloat16>;
+
+TEST(CastOpTest, NonStringTypes) {
+  boost::mp11::mp_for_each<boost::mp11::mp_product<std::pair, CastNonStringTypes, CastNonStringTypes>>(
+      CastNonStringTester{});
+}
+
+TEST(CastOpTest, FromString) {
+  const std::vector<int64_t> shape{2, 2, 2};
+  const std::vector<std::string> string_data = {"-inf", "+INF", "0.9767611", "0.28280696",
+                                                "-0.12019656", "5.0", "NaN", "nan"};
+  const std::vector<float> float_output = {-(std::numeric_limits<float>::infinity()), std::numeric_limits<float>::infinity(),
+                                           0.9767611f, 0.28280696f,
+                                           -0.12019656f, 5.0f, NAN, NAN};
+  TestCastOp(gsl::make_span(string_data), gsl::make_span(float_output), shape);
+
+  const std::vector<std::string> float16_string_data = {"-inf", "+INF", "0.5", "0.25",
+                                                        "0.0", "-1.0", "-1.5", "NaN"};
+  const std::vector<MLFloat16> float16_output =
+      CastedValues<float, MLFloat16>(
+          gsl::make_span(
+              std::vector<float>{
+                  -std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity(), 0.5f, 0.25f,
+                  0.0f, -1.0f, -1.5f, NAN}));
+  TestCastOp(gsl::make_span(float16_string_data), gsl::make_span(float16_output), shape);
+
+  const std::vector<std::string> int_16_string_data = {"0", "1", "2", "3", "4", "5", "-32768", "32767"};
+  const std::vector<int16_t> int_16_output = {0, 1, 2, 3, 4, 5, SHRT_MIN, SHRT_MAX};
+  TestCastOp(gsl::make_span(int_16_string_data), gsl::make_span(int_16_output), shape);
+
+  const std::vector<std::string> int_64_string_data = {"0", "1", "2", "3", "4", "5", "-9223372036854775808", "9223372036854775807"};
+  const std::vector<int64_t> int_64_output = {0, 1, 2, 3, 4, 5, LLONG_MIN, LLONG_MAX};
+  TestCastOp(gsl::make_span(int_64_string_data), gsl::make_span(int_64_output), shape);
+}
+
+TEST(CastOpTest, ToString) {
+  const std::vector<int64_t> shape{2, 2, 2};
+  const std::vector<float> float_input = {NAN, -1.f, 0.0391877927f, 0.296140194f, -0.120196559f, 5.0f,
+                                          -std::numeric_limits<float>::infinity(),
+                                          std::numeric_limits<float>::infinity()};
+
+  // float output is formatted with 8 significant digits, so the expected strings differ slightly from the input literals
+  const std::vector<std::string> string_output = {"NaN", "-1", "0.039187793", "0.29614019",
+                                                  "-0.12019656", "5", "-INF", "INF"};
+  TestCastOp(gsl::make_span(float_input), gsl::make_span(string_output), shape);
+
+  const std::vector<MLFloat16> float16_input =
+      CastedValues<float, MLFloat16>(
+          gsl::make_span(
+              std::vector<float>{
+                  -std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity(), 0.5f, 0.25f,
+                  0.0f, -1.0f, -1.5f, NAN}));
+  const std::vector<std::string> float16_string_output = {"-INF", "INF", "0.5", "0.25",
+                                                          "0", "-1", "-1.5", "NaN"};
+  TestCastOp(gsl::make_span(float16_input), gsl::make_span(float16_string_output), shape);
+
+  const std::vector<std::string> int_string_data = {"0", "1", "2", "3", "4", "5", "6", "7"};
+  const std::vector<int16_t> int_16_input = {0, 1, 2, 3, 4, 5, 6, 7};
+  TestCastOp(gsl::make_span(int_16_input), gsl::make_span(int_string_data), shape);
+}
+
+}  // namespace test
+}  // namespace onnxruntime
--- a/onnxruntime/test/providers/cpu/tensor/tensor_op_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/tensor_op_test.cc
@ -84,245 +84,6 @@ TEST(TensorOpTest, ShapeTest3D) {
  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: volume of dimensions is not consistent with weights size
 }

-template <typename SrcType,
-          typename DstType>
-void TestCastOp(const std::initializer_list<SrcType>& input,
-                const std::initializer_list<DstType>& output,
-                const std::vector<int64_t>& dimensions,
-                int64_t toType,
-                ExpectResult expect_result = ExpectResult::kExpectSuccess,
-                const std::string& expected_failure_string = "") {
-  OpTester test("Cast", 9);
-  test.AddAttribute("to", toType);
-  test.AddInput<SrcType>("input", dimensions, input);
-  test.AddOutput<DstType>("output", dimensions, output);
-  test.Run(expect_result, expected_failure_string, {kTensorrtExecutionProvider});
-}
-
-template <typename SrcType>
-void TestCastFromSrc() {
-  std::initializer_list<SrcType> input_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-  const std::vector<int64_t> shape{3, 2, 2};
-
-  auto float_output = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
-  TestCastOp(input_data, float_output, shape, TensorProto::FLOAT);
-
-  auto double_output = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0};
-  TestCastOp(input_data, double_output, shape, TensorProto::DOUBLE);
-
-  auto bool_output = {false, true, true, true, true, true, true, true, true, true, true, true};
-  TestCastOp(input_data, bool_output, shape, TensorProto::BOOL);
-
-  const std::initializer_list<uint8_t> uint8_t_output{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-  TestCastOp(input_data, uint8_t_output, shape, TensorProto::UINT8);
-
-  const std::initializer_list<uint16_t> uint16_t_output{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-  TestCastOp(input_data, uint16_t_output, shape, TensorProto::UINT16);
-
-  const std::initializer_list<uint32_t> uint32_t_output{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-  TestCastOp(input_data, uint32_t_output, shape, TensorProto::UINT32);
-
-  const std::initializer_list<uint64_t> uint64_t_output{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-  TestCastOp(input_data, uint64_t_output, shape, TensorProto::UINT64);
-
-  const std::initializer_list<int16_t> int16_t_output{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-  TestCastOp(input_data, int16_t_output, shape, TensorProto::INT16);
-
-  const std::initializer_list<int32_t> int32_t_output{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-  TestCastOp(input_data, int32_t_output, shape, TensorProto::INT32);
-
-  const std::initializer_list<int64_t> int64_t_output{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-  TestCastOp(input_data, int64_t_output, shape, TensorProto::INT64);
-};
-
-TEST(TensorOpTest, Cast) {
-  TestCastFromSrc<float>();
-  TestCastFromSrc<double>();
-  TestCastFromSrc<uint8_t>();
-  TestCastFromSrc<uint16_t>();
-  TestCastFromSrc<uint32_t>();
-  TestCastFromSrc<uint64_t>();
-  TestCastFromSrc<int8_t>();
-  TestCastFromSrc<int16_t>();
-  TestCastFromSrc<int32_t>();
-  TestCastFromSrc<int64_t>();
-}
-
-TEST(TensorOpTest, CastFromBool) {
-  auto bool_data = {false, true, true, true, true, true, true, true, true, true, false, true};
-  const std::vector<int64_t> shape{3, 2, 2};
-
-  const std::initializer_list<float> float_output = {0.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 1.0f};
-  TestCastOp(bool_data, float_output, shape, TensorProto::FLOAT);
-
-  const std::initializer_list<double> double_output = {0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0};
-  TestCastOp(bool_data, double_output, shape, TensorProto::DOUBLE);
-
-  auto bool_output = {false, true, true, true, true, true, true, true, true, true, false, true};
-  TestCastOp(bool_data, bool_output, shape, TensorProto::BOOL);
-
-  const std::initializer_list<uint8_t> uint8_t_output{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1};
-  TestCastOp(bool_data, uint8_t_output, shape, TensorProto::UINT8);
-
-  const std::initializer_list<uint16_t> uint16_t_output{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1};
-  TestCastOp(bool_data, uint16_t_output, shape, TensorProto::UINT16);
-
-  const std::initializer_list<uint32_t> uint32_t_output{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1};
-  TestCastOp(bool_data, uint32_t_output, shape, TensorProto::UINT32);
-
-  const std::initializer_list<uint64_t> uint64_t_output{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1};
-  TestCastOp(bool_data, uint64_t_output, shape, TensorProto::UINT64);
-
-  const std::initializer_list<int16_t> int16_t_output{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1};
-  TestCastOp(bool_data, int16_t_output, shape, TensorProto::INT16);
-
-  const std::initializer_list<int32_t> int32_t_output{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1};
-  TestCastOp(bool_data, int32_t_output, shape, TensorProto::INT32);
-
-  const std::initializer_list<int64_t> int64_t_output{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1};
-  TestCastOp(bool_data, int64_t_output, shape, TensorProto::INT64);
-
-  const std::initializer_list<MLFloat16> float16_output{
-      MLFloat16(math::floatToHalf(0.0f)),
-      MLFloat16(math::floatToHalf(1.0f)),
-      MLFloat16(math::floatToHalf(1.0f)),
-      MLFloat16(math::floatToHalf(1.0f)),
-      MLFloat16(math::floatToHalf(1.0f)),
-      MLFloat16(math::floatToHalf(1.0f)),
-      MLFloat16(math::floatToHalf(1.0f)),
-      MLFloat16(math::floatToHalf(1.0f)),
-      MLFloat16(math::floatToHalf(1.0f)),
-      MLFloat16(math::floatToHalf(1.0f)),
-      MLFloat16(math::floatToHalf(0.0f)),
-      MLFloat16(math::floatToHalf(1.0f))};
-  TestCastOp(bool_data, float16_output, shape, TensorProto::FLOAT16);
-}
-
-TEST(TensorOpTest, CastToFloat16) {
-  const std::vector<int64_t> shape{3, 2, 2};
-  std::initializer_list<float> float_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
-  const std::initializer_list<MLFloat16> float16_output{
-      MLFloat16(math::floatToHalf(0.0f)),
-      MLFloat16(math::floatToHalf(1.0f)),
-      MLFloat16(math::floatToHalf(2.0f)),
-      MLFloat16(math::floatToHalf(3.0f)),
-      MLFloat16(math::floatToHalf(4.0f)),
-      MLFloat16(math::floatToHalf(5.0f)),
-      MLFloat16(math::floatToHalf(6.0f)),
-      MLFloat16(math::floatToHalf(7.0f)),
-      MLFloat16(math::floatToHalf(8.0f)),
-      MLFloat16(math::floatToHalf(9.0f)),
-      MLFloat16(math::floatToHalf(10.0f)),
-      MLFloat16(math::floatToHalf(11.0f))};
-
-  TestCastOp(float_data, float16_output, shape, TensorProto::FLOAT16);
-
-  std::initializer_list<uint8_t> uint8_t_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-  TestCastOp(uint8_t_data, float16_output, shape, TensorProto::FLOAT16);
-
-  std::initializer_list<uint16_t> uint16_t_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-  TestCastOp(uint16_t_data, float16_output, shape, TensorProto::FLOAT16);
-
-  std::initializer_list<uint32_t> uint32_t_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-  TestCastOp(uint32_t_data, float16_output, shape, TensorProto::FLOAT16);
-
-  std::initializer_list<uint64_t> uint64_t_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-  TestCastOp(uint64_t_data, float16_output, shape, TensorProto::FLOAT16);
-
-  std::initializer_list<int8_t> int8_t_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-  TestCastOp(int8_t_data, float16_output, shape, TensorProto::FLOAT16);
-
-  std::initializer_list<int16_t> int16_t_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-  TestCastOp(int16_t_data, float16_output, shape, TensorProto::FLOAT16);
-
-  std::initializer_list<int32_t> int32_t_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-  TestCastOp(int32_t_data, float16_output, shape, TensorProto::FLOAT16);
-
-  std::initializer_list<int64_t> int64_t_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-  TestCastOp(int64_t_data, float16_output, shape, TensorProto::FLOAT16);
-}
-
-TEST(TensorOpTest, CastFromFloat16) {
-  const std::vector<int64_t> shape{3, 2, 2};
-  const std::initializer_list<float> float_output = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
-  const std::initializer_list<MLFloat16> input = {
-      MLFloat16(math::floatToHalf(0.0f)),
-      MLFloat16(math::floatToHalf(1.0f)),
-      MLFloat16(math::floatToHalf(2.0f)),
-      MLFloat16(math::floatToHalf(3.0f)),
-      MLFloat16(math::floatToHalf(4.0f)),
-      MLFloat16(math::floatToHalf(5.0f)),
-      MLFloat16(math::floatToHalf(6.0f)),
-      MLFloat16(math::floatToHalf(7.0f)),
-      MLFloat16(math::floatToHalf(8.0f)),
-      MLFloat16(math::floatToHalf(9.0f)),
-      MLFloat16(math::floatToHalf(10.0f)),
-      MLFloat16(math::floatToHalf(11.0f))};
-
-  TestCastOp(input, float_output, shape, TensorProto::FLOAT);
-
-  auto bool_data = {false, true, true, true, true, true, true, true, true, true, true, true};
-  TestCastOp(input, bool_data, shape, TensorProto::BOOL);
-
-  std::initializer_list<uint8_t> uint8_t_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-  TestCastOp(input, uint8_t_data, shape, TensorProto::UINT8);
-
-  std::initializer_list<uint16_t> uint16_t_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-  TestCastOp(input, uint16_t_data, shape, TensorProto::UINT16);
-
-  std::initializer_list<uint32_t> uint32_t_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-  TestCastOp(input, uint32_t_data, shape, TensorProto::UINT32);
-
-  std::initializer_list<uint64_t> uint64_t_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-  TestCastOp(input, uint64_t_data, shape, TensorProto::UINT64);
-
-  std::initializer_list<int8_t> int8_t_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-  TestCastOp(input, int8_t_data, shape, TensorProto::INT8);
-
-  std::initializer_list<int16_t> int16_t_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-  TestCastOp(input, int16_t_data, shape, TensorProto::INT16);
-
-  std::initializer_list<int32_t> int32_t_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-  TestCastOp(input, int32_t_data, shape, TensorProto::INT32);
-
-  std::initializer_list<int64_t> int64_t_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-  TestCastOp(input, int64_t_data, shape, TensorProto::INT64);
-}
-
-TEST(TensorOpTest, CastFromString) {
-  const std::vector<int64_t> shape{2, 2, 2};
-  std::initializer_list<std::string> string_data = {"-inf", "+INF", "0.9767611", "0.28280696",
-                                                    "-0.12019656", "5.0", "NaN", "nan"};
-  const std::initializer_list<float> float_output = {-(std::numeric_limits<float>::infinity()), std::numeric_limits<float>::infinity(),
-                                                     0.9767611f, 0.28280696f,
-                                                     -0.12019656f, 5.0f, NAN, NAN};
-  TestCastOp(string_data, float_output, shape, TensorProto::FLOAT);
-
-  std::initializer_list<std::string> int_16_string_data = {"0", "1", "2", "3", "4", "5", "-32768", "32767"};
-  const std::initializer_list<int16_t> int_16_output = {0, 1, 2, 3, 4, 5, SHRT_MIN, SHRT_MAX};
-  TestCastOp(int_16_string_data, int_16_output, shape, TensorProto::INT16);
-
-  std::initializer_list<std::string> int_64_string_data = {"0", "1", "2", "3", "4", "5", "-9223372036854775808", "9223372036854775807"};
-  const std::initializer_list<int64_t> int_64_output = {0, 1, 2, 3, 4, 5, LLONG_MIN, LLONG_MAX};
-  TestCastOp(int_64_string_data, int_64_output, shape, TensorProto::INT64);
-}
-
-TEST(TensorOpTest, CastToString) {
-  const std::vector<int64_t> shape{2, 2, 2};
-  const std::initializer_list<float> float_input = {NAN, -1.f, 0.0391877927f, 0.296140194f, -0.120196559f, 5.0f,
-                                                    -std::numeric_limits<float>::infinity(),
-                                                    std::numeric_limits<float>::infinity()};
-
-  // float output precision is 8, so the expected output differs slightly from the input due to that
-  std::initializer_list<std::string> string_output = {"NaN", "-1", "0.039187793", "0.29614019",
-                                                      "-0.12019656", "5", "-INF", "INF"};
-  TestCastOp(float_input, string_output, shape, TensorProto::STRING);
-
-  std::initializer_list<std::string> int_string_data = {"0", "1", "2", "3", "4", "5", "6", "7"};
-  const std::initializer_list<int16_t> int_16_input = {0, 1, 2, 3, 4, 5, 6, 7};
-  TestCastOp(int_16_input, int_string_data, shape, TensorProto::STRING);
-}
-
 void MeanVarianceNormalizationFunctionDefaultPerChannel() {
  const int64_t N = 2, C = 2, H = 2, W = 3;

--- a/onnxruntime/test/providers/provider_test_utils.cc
+++ b/onnxruntime/test/providers/provider_test_utils.cc
@ -251,9 +251,11 @@ void Check<MLFloat16>(const OpTester::Data& expected_data,
  threshold = 0.005f;
 #endif
  for (int i = 0; i < size; ++i) {
-    if (std::isinf(f_expected[i]))  // Test infinity for equality
-      EXPECT_EQ(f_expected[i], f_output[i]) << "i:" << i;
-    else {
+    if (std::isnan(f_expected[i])) {  // NaN never compares equal, so verify the produced value with isnan
+      EXPECT_TRUE(std::isnan(f_output[i])) << "Expected NaN. i:" << i << ", provider_type: " << provider_type;
+    } else if (std::isinf(f_expected[i])) {  // Test infinity for equality
+      EXPECT_EQ(f_expected[i], f_output[i]) << "Expected infinity. i:" << i << ", provider_type: " << provider_type;
+    } else {
      // the default for existing tests
      EXPECT_NEAR(f_expected[i], f_output[i], threshold)
          << "i:" << i << ", provider_type: " << provider_type;
@ -284,9 +286,11 @@ void Check<BFloat16>(const OpTester::Data& expected_data,
  /// XXX: May need to adjust threshold as BFloat is coarse
  float threshold = 0.001f;
  for (int i = 0; i < size; ++i) {
-    if (std::isinf(f_expected[i]))  // Test infinity for equality
-      EXPECT_EQ(f_expected[i], f_output[i]);
-    else {
+    if (std::isnan(f_expected[i])) {  // NaN never compares equal, so verify the produced value with isnan
+      EXPECT_TRUE(std::isnan(f_output[i])) << "Expected NaN. i:" << i << ", provider_type: " << provider_type;
+    } else if (std::isinf(f_expected[i])) {  // Test infinity for equality
+      EXPECT_EQ(f_expected[i], f_output[i]) << "Expected infinity. i:" << i << ", provider_type: " << provider_type;
+    } else {
      // the default for existing tests
      const float max_value = fmax(fabs(f_expected[i]), fabs(f_output[i]));
      if (max_value != 0) {  // max_value = 0 means output and expected are 0s.