Implement Mod operator (#900)

Implement Mod operator
2026-06-25 02:50:42 +00:00 · 2019-04-25 17:49:11 -07:00 · 2019-04-25 17:49:11 -07:00 · 893b48e92a
commit 893b48e92a
parent b8eaa88bd4
5 changed files with 427 additions and 35 deletions
--- a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
+++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
@ -241,6 +241,7 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, St
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, TopK);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, MaxPool);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, AveragePool);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, Mod);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, float, Resize);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, int32_t, Resize);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, uint8_t, Resize);
@ -501,6 +502,7 @@ void RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) {
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, TopK)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, MaxPool)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, AveragePool)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, Mod)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, float, Resize)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, int32_t, Resize)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, uint8_t, Resize)>,
--- a/onnxruntime/core/providers/cpu/math/element_wise_ops.cc
+++ b/onnxruntime/core/providers/cpu/math/element_wise_ops.cc
@ -3,6 +3,9 @@

 #include "core/providers/cpu/math/element_wise_ops.h"
 #include <unsupported/Eigen/SpecialFunctions>
+#include "core/util/math.h"
+
+#include <cmath>

 namespace onnxruntime {

@ -1012,7 +1015,7 @@ REG_EXPAND_KERNEL(bool)
 REG_EXPAND_KERNEL(MLFloat16)

 #ifndef DISABLE_CONTRIB_OPS
-namespace contrib{
+namespace contrib {
 template <>
 Status Scale<float>::Compute(OpKernelContext* ctx) const {
  auto& X = *ctx->Input<Tensor>(0);
@ -1020,7 +1023,7 @@ Status Scale<float>::Compute(OpKernelContext* ctx) const {
  EigenMap<float>(Y) = scale_ * EigenMap<float>(X);
  return Status::OK();
 }
-}
+}  // namespace contrib
 #endif

 template <>
@ -1034,4 +1037,226 @@ Status Erf<float>::Compute(OpKernelContext* context) const {
  return Status::OK();
 }

+// CPU kernel for the ONNX Mod operator.
+// The optional integer attribute "fmod" selects the remainder flavour:
+//   0 (default) - integer modulus (mod_internal::BroadCastMod)
+//   1           - std::fmod based remainder (mod_internal::BroadCast*FMod)
+// Any other value is rejected at construction time.
+class Mod final : public OpKernel {
+ public:
+  // explicit: a kernel should never be created through an implicit
+  // conversion from OpKernelInfo.
+  explicit Mod(const OpKernelInfo& info) : OpKernel(info) {
+    int64_t fmod = 0;
+    // A failed lookup is not treated as an error: fmod_ keeps its default (false).
+    Status s = info.GetAttr<int64_t>("fmod", &fmod);
+    if (s.IsOK()) {
+      ORT_ENFORCE((fmod == 0) || (fmod == 1), "fmod must have value either 0 or 1");
+      fmod_ = (fmod == 1);
+    }
+  }
+
+  // Reads inputs 0 and 1 and writes their element-wise remainder to output 0.
+  Status Compute(OpKernelContext* context) const override;
+
+ private:
+  // True when the "fmod" attribute was present and equal to 1.
+  bool fmod_{false};
+};
+
+// Registers the Mod class as the CPU kernel for operator "Mod", version 10.
+// Type constraint "T" lists every tensor element type the kernel accepts;
+// Mod::Compute dispatches on this same set of types at run time, so the two
+// lists have to be kept in sync.
+ONNX_CPU_OPERATOR_KERNEL(
+    Mod,
+    10,
+    KernelDefBuilder().TypeConstraint("T", {DataTypeImpl::GetTensorType<float>(),
+                                            DataTypeImpl::GetTensorType<double>(),
+                                            DataTypeImpl::GetTensorType<int64_t>(),
+                                            DataTypeImpl::GetTensorType<uint64_t>(),
+                                            DataTypeImpl::GetTensorType<int32_t>(),
+                                            DataTypeImpl::GetTensorType<uint32_t>(),
+                                            DataTypeImpl::GetTensorType<int16_t>(),
+                                            DataTypeImpl::GetTensorType<uint16_t>(),
+                                            DataTypeImpl::GetTensorType<int8_t>(),
+                                            DataTypeImpl::GetTensorType<uint8_t>(),
+                                            DataTypeImpl::GetTensorType<MLFloat16>()}),
+    Mod);
+
+namespace mod_internal {
+
+// Broadcasting std::fmod: for every broadcast pair (x, y) taken from X and Y,
+// writes static_cast<T>(std::fmod(x, y)) into output 0.
+// NOTE(review): for integer T std::fmod computes in double, so 64-bit operands
+// above 2^53 may lose precision - confirm whether callers care.
+template <class T>
+void BroadCastFMod(const Tensor& X, const Tensor& Y, OpKernelContext* context) {
+  TBroadcaster<T, T> broadcaster{X, Y};
+  Tensor* const result = context->Output(0, broadcaster.GetOutputShape());
+  ORT_ENFORCE(result, "failed to get first output!");
+  TBroadcastOutput<T> broadcast_output{broadcaster.GetSpanSize(), *result};
+
+  BroadcastLoopSpan(
+      broadcaster, broadcast_output,
+      // Scalar dividend against a span of divisors.
+      [](gsl::span<T> out, const T& dividend, gsl::span<const T> divisors) {
+        std::transform(divisors.cbegin(), divisors.cend(), out.begin(),
+                       [dividend](T divisor) {
+                         return static_cast<T>(std::fmod(dividend, divisor));
+                       });
+      },
+      // Span of dividends against a scalar divisor.
+      [](gsl::span<T> out, gsl::span<const T> dividends, const T& divisor) {
+        std::transform(dividends.cbegin(), dividends.cend(), out.begin(),
+                       [divisor](T dividend) {
+                         return static_cast<T>(std::fmod(dividend, divisor));
+                       });
+      },
+      // Two spans, combined element by element.
+      [](gsl::span<T> out, gsl::span<const T> dividends, gsl::span<const T> divisors) {
+        std::transform(
+            dividends.cbegin(), dividends.cend(), divisors.cbegin(), out.begin(),
+            [](T dividend, T divisor) {
+              return static_cast<T>(std::fmod(dividend, divisor));
+            });
+      });
+}
+
+// Integer modulus whose non-zero result takes the sign of the divisor y
+// (C++ '%' on its own takes the sign of the dividend x).
+// Operands narrower than int are promoted by '%', hence the cast back to T.
+// y == 0 is not handled here and remains undefined behaviour, as with '%'.
+template <class T>
+inline T Modulus(T x, T y) {
+  // Anything modulo -1 is 0. Answering directly avoids evaluating
+  // min(T) % -1, which overflows and is undefined behaviour for signed
+  // 32/64-bit T. `y < 0` is never true for unsigned T, so unsigned types
+  // always fall through to the regular path.
+  if (y < 0 && y == static_cast<T>(-1)) {
+    return static_cast<T>(0);
+  }
+  auto res = x % y;
+  // '%' truncates toward zero; shift the result into the divisor's sign.
+  if ((res < 0 && y > 0) || (res > 0 && y < 0)) {
+    res += y;
+  }
+  return static_cast<T>(res);
+}
+
+// Broadcasting integer modulus: for every broadcast pair (x, y) taken from
+// X and Y, writes Modulus(x, y) into output 0. Modulus narrows the promoted
+// '%' result back to T itself, so no casts are needed here.
+template <class T>
+void BroadCastMod(const Tensor& X, const Tensor& Y, OpKernelContext* context) {
+  TBroadcaster<T, T> broadcaster{X, Y};
+  Tensor* const result = context->Output(0, broadcaster.GetOutputShape());
+  ORT_ENFORCE(result, "failed to get first output!");
+  TBroadcastOutput<T> broadcast_output{broadcaster.GetSpanSize(), *result};
+
+  BroadcastLoopSpan(
+      broadcaster, broadcast_output,
+      // Scalar dividend against a span of divisors.
+      [](gsl::span<T> out, const T& dividend, gsl::span<const T> divisors) {
+        std::transform(divisors.cbegin(), divisors.cend(), out.begin(),
+                       [dividend](T divisor) { return Modulus(dividend, divisor); });
+      },
+      // Span of dividends against a scalar divisor.
+      [](gsl::span<T> out, gsl::span<const T> dividends, const T& divisor) {
+        std::transform(dividends.cbegin(), dividends.cend(), out.begin(),
+                       [divisor](T dividend) { return Modulus(dividend, divisor); });
+      },
+      // Two spans, combined element by element.
+      [](gsl::span<T> out, gsl::span<const T> dividends, gsl::span<const T> divisors) {
+        std::transform(
+            dividends.cbegin(), dividends.cend(), divisors.cbegin(), out.begin(),
+            [](T dividend, T divisor) { return Modulus(dividend, divisor); });
+      });
+}
+
+// Broadcasting std::fmod for MLFloat16 tensors. Each half value is widened
+// with math::halfToFloat, the remainder is taken in float, and the result is
+// narrowed back with math::floatToHalf before being written to output 0.
+void BroadCastMFloat16FMod(const Tensor& X, const Tensor& Y, OpKernelContext* context) {
+  TBroadcaster<MLFloat16, MLFloat16> broadcaster{X, Y};
+  Tensor* const result = context->Output(0, broadcaster.GetOutputShape());
+  ORT_ENFORCE(result, "failed to get first output!");
+  TBroadcastOutput<MLFloat16> broadcast_output{broadcaster.GetSpanSize(), *result};
+
+  BroadcastLoopSpan(
+      broadcaster, broadcast_output,
+      // Scalar dividend: widen it once, outside the per-element work.
+      [](gsl::span<MLFloat16> out, const MLFloat16& dividend, gsl::span<const MLFloat16> divisors) {
+        std::transform(divisors.cbegin(), divisors.cend(), out.begin(),
+                       [dividend_fl = math::halfToFloat(dividend.val)](const MLFloat16& divisor) {
+                         const auto divisor_fl = math::halfToFloat(divisor.val);
+                         return MLFloat16(math::floatToHalf(std::fmod(dividend_fl, divisor_fl)));
+                       });
+      },
+      // Scalar divisor: widen it once, outside the per-element work.
+      [](gsl::span<MLFloat16> out, gsl::span<const MLFloat16> dividends, const MLFloat16& divisor) {
+        std::transform(dividends.cbegin(), dividends.cend(), out.begin(),
+                       [divisor_fl = math::halfToFloat(divisor.val)](const MLFloat16& dividend) {
+                         const auto dividend_fl = math::halfToFloat(dividend.val);
+                         return MLFloat16(math::floatToHalf(std::fmod(dividend_fl, divisor_fl)));
+                       });
+      },
+      // Two spans, combined element by element.
+      [](gsl::span<MLFloat16> out, gsl::span<const MLFloat16> dividends, gsl::span<const MLFloat16> divisors) {
+        std::transform(
+            dividends.cbegin(), dividends.cend(), divisors.cbegin(), out.begin(),
+            [](const MLFloat16& dividend, const MLFloat16& divisor) {
+              const auto remainder = std::fmod(math::halfToFloat(dividend.val),
+                                               math::halfToFloat(divisor.val));
+              return MLFloat16(math::floatToHalf(remainder));
+            });
+      });
+}
+
+}  // namespace mod_internal
+
+namespace mod_internal {
+
+// Runs the integer kernel for element type T: the std::fmod based remainder
+// when use_fmod is set, the integer modulus otherwise.
+template <class T>
+void BroadCastIntegerMod(bool use_fmod, const Tensor& X, const Tensor& Y, OpKernelContext* context) {
+  if (use_fmod) {
+    BroadCastFMod<T>(X, Y, context);
+  } else {
+    BroadCastMod<T>(X, Y, context);
+  }
+}
+
+}  // namespace mod_internal
+
+// Writes the element-wise remainder of input 0 by input 1 to output 0.
+// Returns INVALID_ARGUMENT when the two inputs have different element types.
+// float, double and MLFloat16 inputs require the fmod attribute to be set
+// (enforced with ORT_ENFORCE); integer inputs honour the attribute either way.
+Status Mod::Compute(OpKernelContext* context) const {
+  const auto& X = *context->Input<Tensor>(0);
+  const auto& Y = *context->Input<Tensor>(1);
+
+  const auto dtype = X.DataType();
+  if (dtype != Y.DataType()) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "X and Y input types do not match: ",
+                           dtype, " vs ", Y.DataType());
+  }
+
+  using namespace mod_internal;
+
+  if (dtype == DataTypeImpl::GetType<float>()) {
+    ORT_ENFORCE(fmod_, "fmod attribute must be true for float, float16 and double types");
+    BroadCastFMod<float>(X, Y, context);
+  } else if (dtype == DataTypeImpl::GetType<double>()) {
+    ORT_ENFORCE(fmod_, "fmod attribute must be true for float, float16 and double types");
+    BroadCastFMod<double>(X, Y, context);
+  } else if (dtype == DataTypeImpl::GetType<MLFloat16>()) {
+    ORT_ENFORCE(fmod_, "fmod attribute must be true for float, float16 and double types");
+    BroadCastMFloat16FMod(X, Y, context);
+  } else if (dtype == DataTypeImpl::GetType<uint8_t>()) {
+    BroadCastIntegerMod<uint8_t>(fmod_, X, Y, context);
+  } else if (dtype == DataTypeImpl::GetType<int8_t>()) {
+    BroadCastIntegerMod<int8_t>(fmod_, X, Y, context);
+  } else if (dtype == DataTypeImpl::GetType<uint16_t>()) {
+    BroadCastIntegerMod<uint16_t>(fmod_, X, Y, context);
+  } else if (dtype == DataTypeImpl::GetType<int16_t>()) {
+    BroadCastIntegerMod<int16_t>(fmod_, X, Y, context);
+  } else if (dtype == DataTypeImpl::GetType<uint32_t>()) {
+    BroadCastIntegerMod<uint32_t>(fmod_, X, Y, context);
+  } else if (dtype == DataTypeImpl::GetType<int32_t>()) {
+    BroadCastIntegerMod<int32_t>(fmod_, X, Y, context);
+  } else if (dtype == DataTypeImpl::GetType<uint64_t>()) {
+    BroadCastIntegerMod<uint64_t>(fmod_, X, Y, context);
+  } else if (dtype == DataTypeImpl::GetType<int64_t>()) {
+    BroadCastIntegerMod<int64_t>(fmod_, X, Y, context);
+  } else {
+    // Separator added so the streamed type is not glued to the message text.
+    ORT_ENFORCE(false, "Unsupported data type: ", dtype);
+  }
+
+  return Status::OK();
+}
+
 }  // namespace onnxruntime
--- a/onnxruntime/test/onnx/main.cc
+++ b/onnxruntime/test/onnx/main.cc
@ -236,11 +236,11 @@ int real_main(int argc, char* argv[], OrtEnv** p_env) {
    }

    std::unordered_set<std::string> cuda_flaky_tests = {
-      "fp16_inception_v1", "fp16_shufflenet", "fp16_tiny_yolov2"};
+        "fp16_inception_v1", "fp16_shufflenet", "fp16_tiny_yolov2"};

-#if (defined (_WIN32) && !defined(_WIN64)) || (defined(__GNUG__) && !defined(__LP64__))
+#if (defined(_WIN32) && !defined(_WIN64)) || (defined(__GNUG__) && !defined(__LP64__))
    //Minimize mem consumption
-    LoadTests (data_dirs, whitelisted_test_cases, per_sample_tolerance, relative_per_sample_tolerance, [&stat, &sf, enable_cuda, &cuda_flaky_tests] (ITestCase* l) {
+    LoadTests(data_dirs, whitelisted_test_cases, per_sample_tolerance, relative_per_sample_tolerance, [&stat, &sf, enable_cuda, &cuda_flaky_tests](ITestCase* l) {
      std::unique_ptr<ITestCase> test_case_ptr(l);
      if (enable_cuda && cuda_flaky_tests.find(l->GetTestCaseName()) != cuda_flaky_tests.end()) {
        return;
@ -253,15 +253,14 @@ int real_main(int argc, char* argv[], OrtEnv** p_env) {
    });
 #else
    std::vector<ITestCase*> tests;
-    LoadTests(data_dirs, whitelisted_test_cases, per_sample_tolerance, relative_per_sample_tolerance, [&tests] (ITestCase* l) { tests.push_back(l); });
+    LoadTests(data_dirs, whitelisted_test_cases, per_sample_tolerance, relative_per_sample_tolerance, [&tests](ITestCase* l) { tests.push_back(l); });
    if (enable_cuda) {
      for (auto it = tests.begin(); it != tests.end();) {
        auto iter = cuda_flaky_tests.find((*it)->GetTestCaseName());
        if (iter != cuda_flaky_tests.end()) {
          delete *it;
          it = tests.erase(it);
-        }
-        else {
+        } else {
          ++it;
        }
      }
@ -357,6 +356,7 @@ int real_main(int argc, char* argv[], OrtEnv** p_env) {
      {"tf_mobilenet_v1_1.0_224", "result mismatch"},
      {"mobilenetv2-1.0", "result mismatch"},
      {"mxnet_arcface", "result mismatch"},
+      {"mod_float_mixed_sign_example", "faulty test"}
  };

 #ifdef USE_CUDA
@ -364,7 +364,7 @@ int real_main(int argc, char* argv[], OrtEnv** p_env) {
 #endif
  // clang-format on

-#if defined (_WIN32) && !defined(_WIN64)
+#if defined(_WIN32) && !defined(_WIN64)
  broken_tests["vgg19"] = "failed: bad allocation";
 #endif

--- a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc
+++ b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc
@ -4,6 +4,7 @@
 #include "gtest/gtest.h"
 #include "test/providers/provider_test_utils.h"
 #include "core/util/math.h"
+#include <algorithm>
 #include <cmath>

 namespace onnxruntime {
@ -14,7 +15,7 @@ TEST(MathOpTest, Add_int32) {
  test.AddInput<int32_t>("A", {3}, {1, 2, 3});
  test.AddInput<int32_t>("B", {3}, {4, 5, 6});
  test.AddOutput<int32_t>("C", {3}, {5, 7, 9});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//TensorRT parser: elementwise inputs must not be Int32
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT parser: elementwise inputs must not be Int32
 }

 TEST(MathOpTest, Add_int64) {
@ -22,7 +23,7 @@ TEST(MathOpTest, Add_int64) {
  test.AddInput<int64_t>("A", {3}, {1, 2, 3});
  test.AddInput<int64_t>("B", {3}, {4, 5, 6});
  test.AddOutput<int64_t>("C", {3}, {5, 7, 9});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//TensorRT: INT64 is not supported
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: INT64 is not supported
 }

 TEST(MathOpTest, Add) {
@ -68,7 +69,7 @@ TEST(MathOpTest, Add_Broadcast_0x0) {
  test.AddInput<float>("A", {}, {10.0f});
  test.AddInput<float>("B", {}, {2.0f});
  test.AddOutput<float>("C", {}, {12.0f});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//TensorRT: dynamic shape is not supported
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: dynamic shape is not supported
 }

 TEST(MathOpTest, Add_Broadcast_0x1) {
@ -77,7 +78,7 @@ TEST(MathOpTest, Add_Broadcast_0x1) {
  test.AddInput<float>("A", {}, {10.0f});
  test.AddInput<float>("B", {1}, {2.0f});
  test.AddOutput<float>("C", {1}, {12.0f});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//TensorRT: dynamic shape is not supported
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: dynamic shape is not supported
 }

 TEST(MathOpTest, Add_Broadcast_1x0) {
@ -86,7 +87,7 @@ TEST(MathOpTest, Add_Broadcast_1x0) {
  test.AddInput<float>("A", {1}, {10.0f});
  test.AddInput<float>("B", {}, {2.0f});
  test.AddOutput<float>("C", {1}, {12.0f});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//TensorRT: dynamic shape is not supported
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: dynamic shape is not supported
 }

 TEST(MathOpTest, Add_Broadcast_1x1) {
@ -133,7 +134,7 @@ TEST(MathOpTest, Add_Broadcast_2x1x4_1x3x1) {
                         211.0f, 212.0f, 213.0f, 214.0f,
                         221.0f, 222.0f, 223.0f, 224.0f,
                         231.0f, 232.0f, 233.0f, 234.0f});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//Input batch size is inconsistent
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //Input batch size is inconsistent
 }

 TEST(MathOpTest, Add_Broadcast_2x1x1_3x4) {
@ -153,7 +154,7 @@ TEST(MathOpTest, Add_Broadcast_2x1x1_3x4) {
                         211.0f, 212.0f, 213.0f, 214.0f,
                         221.0f, 222.0f, 223.0f, 224.0f,
                         231.0f, 232.0f, 233.0f, 234.0f});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//Input batch size is inconsistent
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //Input batch size is inconsistent
 }

 TEST(MathOpTest, Sub_int32) {
@ -161,7 +162,7 @@ TEST(MathOpTest, Sub_int32) {
  test.AddInput<int32_t>("A", {3}, {1, 4, 3});
  test.AddInput<int32_t>("B", {3}, {4, 2, 4});
  test.AddOutput<int32_t>("C", {3}, {-3, 2, -1});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//TensorRT parser:elementwise inputs must not be Int32
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT parser:elementwise inputs must not be Int32
 }

 TEST(MathOpTest, Sub_int64) {
@ -169,7 +170,7 @@ TEST(MathOpTest, Sub_int64) {
  test.AddInput<int64_t>("A", {3}, {1, 5, 6});
  test.AddInput<int64_t>("B", {3}, {4, 5, 3});
  test.AddOutput<int64_t>("C", {3}, {-3, 0, 3});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//TensorRT: INT64 is not supported
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: INT64 is not supported
 }

 TEST(MathOpTest, Sub) {
@ -202,7 +203,7 @@ TEST(MathOpTest, Sub_Broadcast_Scalar) {
                        {-4.0f, -3.0f, -6.0f,
                         -5.0f, -3.5f, -105.0f,
                         -10.4f, 4.3f, -10'005.0f});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//TensorRT: dynamic shape is not supported
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: dynamic shape is not supported
 }

 TEST(MathOpTest, Mul_int32) {
@ -210,7 +211,7 @@ TEST(MathOpTest, Mul_int32) {
  test.AddInput<int32_t>("A", {3}, {1, 2, 3});
  test.AddInput<int32_t>("B", {3}, {4, -3, 6});
  test.AddOutput<int32_t>("C", {3}, {4, -6, 18});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//TensorRT parser:elementwise inputs must not be Int32
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT parser:elementwise inputs must not be Int32
 }

 TEST(MathOpTest, Mul_int64) {
@ -218,7 +219,7 @@ TEST(MathOpTest, Mul_int64) {
  test.AddInput<int64_t>("A", {3}, {3, 6, -3});
  test.AddInput<int64_t>("B", {3}, {4, -3, -2});
  test.AddOutput<int64_t>("C", {3}, {12, -18, 6});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//TensorRT: INT64 is not supported
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: INT64 is not supported
 }

 TEST(MathOpTest, Mul) {
@ -244,7 +245,7 @@ TEST(MathOpTest, Div_int32) {
  test.AddInput<int32_t>("A", {3}, {4, 8, 8});
  test.AddInput<int32_t>("B", {3}, {1, 3, 2});
  test.AddOutput<int32_t>("C", {3}, {4, 2, 4});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//TensorRT parser:elementwise inputs must not be Int32
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT parser:elementwise inputs must not be Int32
 }

 TEST(MathOpTest, Div_int64) {
@ -252,7 +253,7 @@ TEST(MathOpTest, Div_int64) {
  test.AddInput<int64_t>("A", {3}, {4, 8, 8});
  test.AddInput<int64_t>("B", {3}, {2, 3, 4});
  test.AddOutput<int64_t>("C", {3}, {2, 2, 2});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//TensorRT: INT64 is not supported
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: INT64 is not supported
 }

 TEST(MathOpTest, Div) {
@ -291,7 +292,7 @@ TEST(MathOpTest, Abs_int32) {
  std::vector<int64_t> dims{4};
  test.AddInput<int32_t>("X", dims, {1, 2, -1, -5});
  test.AddOutput<int32_t>("Y", dims, {1, 2, 1, 5});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//TensorRT parser: Int32 not allowed as input to this layer
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT parser: Int32 not allowed as input to this layer
 }

 TEST(MathOpTest, Neg) {
@ -319,7 +320,7 @@ TEST(MathOpTest, Neg_int32) {
  std::vector<int64_t> dims{4};
  test.AddInput<int32_t>("X", dims, {1, -2, 0, -10});
  test.AddOutput<int32_t>("Y", dims, {-1, 2, 0, 10});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//TensorRT parser: Int32 not allowed as input to this layer
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT parser: Int32 not allowed as input to this layer
 }

 TEST(MathOpTest, Floor) {
@ -392,7 +393,7 @@ TEST(MathOpTest, Pow_Broadcast_Scalar0) {
  test.AddInput<float>("X", {}, {2.0f});
  test.AddInput<float>("Y", dims, {1.0f, 2.0f, 3.0f});
  test.AddOutput<float>("Z", dims, {2.0f, 4.0f, 8.0f});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//TensorRT: dynamic shape is not supported
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: dynamic shape is not supported
 }

 TEST(MathOpTest, Pow_Broadcast_Scalar1) {
@ -402,7 +403,7 @@ TEST(MathOpTest, Pow_Broadcast_Scalar1) {
  test.AddInput<float>("X", dims, {1.0f, 2.0f, 3.0f});
  test.AddInput<float>("Y", {}, {2.0f});
  test.AddOutput<float>("Z", dims, {1.0f, 4.0f, 9.0f});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//TensorRT: dynamic shape is not supported
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: dynamic shape is not supported
 }

 TEST(MathOpTest, Exp) {
@ -469,7 +470,7 @@ TEST(MathOpTest, Sum_8_Test1) {
                         311.0f, 312.0f, 313.0f,
                         321.0f, 322.0f, 323.0f,
                         331.0f, 332.0f, 333.0f});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});// TensorRT parser failed on this test
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  // TensorRT parser failed on this test
 }

 TEST(MathOpTest, Sum_8_Test2) {
@ -581,7 +582,7 @@ TEST(MathOpTest, Max_8) {
                        {10.0f, 20.0f, 30.0f,
                         40.0f, 50.0f, 60.0f,
                         300.0f, 300.0f, 300.0f});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//Input batch size is inconsistent
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //Input batch size is inconsistent
 }

 TEST(MathOpTest, Not) {
@ -757,7 +758,7 @@ TEST(MathOpTest, Mean_8) {
                        {12.0f / 3.0f, 22.0f / 3.0f, 32.0f / 3.0f,
                         43.0f / 3.0f, 53.0f / 3.0f, 63.0f / 3.0f,
                         74.0f / 3.0f, 84.0f / 3.0f, 94.0f / 3.0f});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//Input batch size is inconsistent
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //Input batch size is inconsistent
 }

 #ifndef DISABLE_CONTRIB_OPS
@ -1008,6 +1009,173 @@ TEST(MathOpTest, Erf) {
  test.AddOutput<float>("B", dims, {0.5204999f, 0.8427008f, 0.6778012f, 0.9953223f});
  test.Run();
 }
-}  // namespace test

+// Opset version passed to OpTester by every Mod test below.
+const int ModOp_ver = 10;
+
+// float inputs with fmod=1: each expected remainder carries the sign of the
+// dividend X.
+TEST(ModOpTest, Fmod_float_mixed_sign) {
+  OpTester test("Mod", ModOp_ver);
+  test.AddAttribute<int64_t>("fmod", 1);
+  test.AddInput<float>("X", {6}, {-4.3f, 7.2f, 5.0f, 4.3f, -7.2f, 8.0f});
+  test.AddInput<float>("Y", {6}, {2.1f, -3.4f, 8.0f, -2.1f, 3.4f, 5.0f});
+  test.AddOutput<float>("Z", {6}, {-0.1f, 0.4f, 5.f, 0.1f, -0.4f, 3.f});
+
+  test.Run();
+}
+
+// Test helper: converts a list of floats into MLFloat16 values using
+// math::floatToHalf, preserving order.
+std::vector<MLFloat16> MakeMLFloat16(const std::initializer_list<float>& input) {
+  std::vector<MLFloat16> output;
+  // The element count is known up front, so allocate exactly once.
+  output.reserve(input.size());
+  // Plain loop instead of std::transform + std::back_inserter: no dependency
+  // on <iterator>, which this file does not include explicitly.
+  for (const float fl : input) {
+    output.emplace_back(math::floatToHalf(fl));
+  }
+  return output;
+}
+
+// MLFloat16 inputs with fmod=1. Same operands as the float test; the expected
+// values differ slightly from the float ones (see the inline comment).
+TEST(ModOpTest, Fmod_float16_mixed_sign) {
+  OpTester test("Mod", ModOp_ver);
+  test.AddAttribute<int64_t>("fmod", 1);
+
+  test.AddInput<MLFloat16>("X", {6}, MakeMLFloat16({-4.3f, 7.2f, 5.0f, 4.3f, -7.2f, 8.0f}));
+  test.AddInput<MLFloat16>("Y", {6}, MakeMLFloat16({2.1f, -3.4f, 8.0f, -2.1f, 3.4f, 5.0f}));
+  // The output above is {-0.1f, 0.4f, 5.f, 0.1f, -0.4f, 3.f} for float
+  test.AddOutput<MLFloat16>("Z", {6}, MakeMLFloat16({-0.1015625f, 0.3984375f, 5.f, 0.1015625f, -0.3984375f, 3.f}));
+
+  test.Run();
+}
+
+// int8 inputs without the fmod attribute: non-zero expected results carry the
+// sign of the divisor Y.
+TEST(ModOpTest, Int8_mixed_sign) {
+  OpTester test("Mod", ModOp_ver);
+  test.AddInput<int8_t>("X", {6}, {-4, 7, 5, 4, -7, 8});
+  test.AddInput<int8_t>("Y", {6}, {2, -3, 8, -2, 3, 5});
+  test.AddOutput<int8_t>("Z", {6}, {0, -2, 5, 0, 2, 3});
+
+  test.Run();
+}
+
+// int8 inputs with fmod=1: non-zero expected results carry the sign of the
+// dividend X.
+TEST(ModOpTest, Int8_mixed_sign_fmod) {
+  OpTester test("Mod", ModOp_ver);
+  test.AddAttribute<int64_t>("fmod", 1);
+
+  test.AddInput<int8_t>("X", {6}, {-4, 7, 5, 4, -7, 8});
+  test.AddInput<int8_t>("Y", {6}, {2, -3, 8, -2, 3, 5});
+  test.AddOutput<int8_t>("Z", {6}, {0, 1, 5, 0, -1, 3});
+
+  test.Run();
+}
+
+// uint8 inputs without the fmod attribute: plain unsigned remainder.
+TEST(ModOpTest, UInt8_mod) {
+  OpTester test("Mod", ModOp_ver);
+  test.AddInput<uint8_t>("X", {6}, {4, 7, 5, 4, 7, 8});
+  test.AddInput<uint8_t>("Y", {6}, {2, 3, 8, 2, 3, 5});
+  test.AddOutput<uint8_t>("Z", {6}, {0, 1, 5, 0, 1, 3});
+
+  test.Run();
+}
+
+// int16 inputs without the fmod attribute: non-zero expected results carry
+// the sign of the divisor Y.
+TEST(ModOpTest, Int16_mixed_sign) {
+  OpTester test("Mod", ModOp_ver);
+  test.AddInput<int16_t>("X", {6}, {-4, 7, 5, 4, -7, 8});
+  test.AddInput<int16_t>("Y", {6}, {2, -3, 8, -2, 3, 5});
+  test.AddOutput<int16_t>("Z", {6}, {0, -2, 5, 0, 2, 3});
+
+  test.Run();
+}
+
+// int16 inputs with fmod=1: non-zero expected results carry the sign of the
+// dividend X.
+TEST(ModOpTest, Int16_mixed_sign_fmod) {
+  OpTester test("Mod", ModOp_ver);
+  test.AddAttribute<int64_t>("fmod", 1);
+
+  test.AddInput<int16_t>("X", {6}, {-4, 7, 5, 4, -7, 8});
+  test.AddInput<int16_t>("Y", {6}, {2, -3, 8, -2, 3, 5});
+  test.AddOutput<int16_t>("Z", {6}, {0, 1, 5, 0, -1, 3});
+
+  test.Run();
+}
+
+// uint16 inputs without the fmod attribute: plain unsigned remainder.
+TEST(ModOpTest, UInt16_mod) {
+  OpTester test("Mod", ModOp_ver);
+  test.AddInput<uint16_t>("X", {6}, {4, 7, 5, 4, 7, 8});
+  test.AddInput<uint16_t>("Y", {6}, {2, 3, 8, 2, 3, 5});
+  test.AddOutput<uint16_t>("Z", {6}, {0, 1, 5, 0, 1, 3});
+
+  test.Run();
+}
+
+// int32 inputs without the fmod attribute: non-zero expected results carry
+// the sign of the divisor Y.
+TEST(ModOpTest, Int32_mixed_sign) {
+  OpTester test("Mod", ModOp_ver);
+  test.AddInput<int32_t>("X", {6}, {-4, 7, 5, 4, -7, 8});
+  test.AddInput<int32_t>("Y", {6}, {2, -3, 8, -2, 3, 5});
+  test.AddOutput<int32_t>("Z", {6}, {0, -2, 5, 0, 2, 3});
+
+  test.Run();
+}
+
+// int32 inputs with fmod=1: non-zero expected results carry the sign of the
+// dividend X.
+TEST(ModOpTest, Int32_mixed_sign_fmod) {
+  OpTester test("Mod", ModOp_ver);
+  test.AddAttribute<int64_t>("fmod", 1);
+
+  test.AddInput<int32_t>("X", {6}, {-4, 7, 5, 4, -7, 8});
+  test.AddInput<int32_t>("Y", {6}, {2, -3, 8, -2, 3, 5});
+  test.AddOutput<int32_t>("Z", {6}, {0, 1, 5, 0, -1, 3});
+
+  test.Run();
+}
+
+// uint32 inputs without the fmod attribute: plain unsigned remainder.
+TEST(ModOpTest, UInt32_mod) {
+  OpTester test("Mod", ModOp_ver);
+  test.AddInput<uint32_t>("X", {6}, {4, 7, 5, 4, 7, 8});
+  test.AddInput<uint32_t>("Y", {6}, {2, 3, 8, 2, 3, 5});
+  test.AddOutput<uint32_t>("Z", {6}, {0, 1, 5, 0, 1, 3});
+
+  test.Run();
+}
+
+// int64 inputs without the fmod attribute: non-zero expected results carry
+// the sign of the divisor Y.
+TEST(ModOpTest, Int64_mixed_sign) {
+  OpTester test("Mod", ModOp_ver);
+  test.AddInput<int64_t>("X", {6}, {-4, 7, 5, 4, -7, 8});
+  test.AddInput<int64_t>("Y", {6}, {2, -3, 8, -2, 3, 5});
+  test.AddOutput<int64_t>("Z", {6}, {0, -2, 5, 0, 2, 3});
+
+  test.Run();
+}
+
+// int64 inputs with fmod=1: non-zero expected results carry the sign of the
+// dividend X.
+TEST(ModOpTest, Int64_mixed_sign_fmod) {
+  OpTester test("Mod", ModOp_ver);
+  test.AddAttribute<int64_t>("fmod", 1);
+
+  test.AddInput<int64_t>("X", {6}, {-4, 7, 5, 4, -7, 8});
+  test.AddInput<int64_t>("Y", {6}, {2, -3, 8, -2, 3, 5});
+  test.AddOutput<int64_t>("Z", {6}, {0, 1, 5, 0, -1, 3});
+
+  test.Run();
+}
+
+// uint64 inputs without the fmod attribute: plain unsigned remainder.
+TEST(ModOpTest, UInt64_mod) {
+  OpTester test("Mod", ModOp_ver);
+  test.AddInput<uint64_t>("X", {6}, {4, 7, 5, 4, 7, 8});
+  test.AddInput<uint64_t>("Y", {6}, {2, 3, 8, 2, 3, 5});
+  test.AddOutput<uint64_t>("Z", {6}, {0, 1, 5, 0, 1, 3});
+
+  test.Run();
+}
+
+// Broadcasting: a {3, 2, 5} int32 tensor holding 0..29 against a one-element
+// divisor tensor {7}; the expected output is each input value modulo 7.
+TEST(ModOpTest, Int32_mod_bcast) {
+  OpTester test("Mod", ModOp_ver);
+
+  std::vector<int32_t> input_sequence;
+  input_sequence.resize(30);
+  // Fill with consecutive integers starting at 0.
+  std::generate(input_sequence.begin(), input_sequence.end(),
+                [n = 0]() mutable { return n++; });
+
+  // input [0..29]
+  test.AddInput<int32_t>("X", {3, 2, 5}, input_sequence);
+  test.AddInput<int32_t>("Y", {1}, {7});
+
+  test.AddOutput<int32_t>("Z", {3, 2, 5},
+                          {0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1});
+
+  test.Run();
+}
+
+}  // namespace test
 }  // namespace onnxruntime
--- a/onnxruntime/test/python/onnx_backend_test_series.py
+++ b/onnxruntime/test/python/onnx_backend_test_series.py
@ -89,10 +89,7 @@ backend_test.exclude(r'('
 '|^test_resize_nearest_cpu.*'
 '|^test_resize_upsample_linear_cpu.*'
 '|^test_resize_upsample_nearest_cpu.*'
-'|^test_mod_bcast.*'
-'|^test_mod_float_mixed_sign_example.*'
-'|^test_mod_fmod_mixed_sign_example.*'
-'|^test_mod_int64_mixed_sign_example.*'
+'|^test_mod_float_mixed_sign_example*'
 '|^test_reversesequence_batch_cpu.*'
 '|^test_reversesequence_time_cpu.*'
 '|^test_roialign_cpu.*'