diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
index 891e8566a6..11e251de43 100644
--- a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
+++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
@@ -241,6 +241,7 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, St
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, TopK);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, MaxPool);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, AveragePool);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, Mod);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, float, Resize);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, int32_t, Resize);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, uint8_t, Resize);
@@ -501,6 +502,7 @@ void RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, TopK)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, MaxPool)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, AveragePool)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, Mod)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, float, Resize)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, int32_t, Resize)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, uint8_t, Resize)>,
diff --git a/onnxruntime/core/providers/cpu/math/element_wise_ops.cc b/onnxruntime/core/providers/cpu/math/element_wise_ops.cc
index 187c83d084..a19038ec7b 100644
--- a/onnxruntime/core/providers/cpu/math/element_wise_ops.cc
+++ b/onnxruntime/core/providers/cpu/math/element_wise_ops.cc
@@ -3,6 +3,9 @@
 
 #include "core/providers/cpu/math/element_wise_ops.h"
 #include <unsupported/Eigen/SpecialFunctions>
+#include "core/util/math.h"
+
+#include <cmath>
 
 namespace onnxruntime {
 
@@ -1012,7 +1015,7 @@ REG_EXPAND_KERNEL(bool)
 REG_EXPAND_KERNEL(MLFloat16)
 
 #ifndef DISABLE_CONTRIB_OPS
-namespace contrib{
+namespace contrib {
 template <>
 Status Scale<float>::Compute(OpKernelContext* ctx) const {
   auto& X = *ctx->Input<Tensor>(0);
@@ -1020,7 +1023,7 @@ Status Scale<float>::Compute(OpKernelContext* ctx) const {
   EigenMap<float>(Y) = scale_ * EigenMap<float>(X);
   return Status::OK();
 }
-}
+}  // namespace contrib
 #endif
 
 template <>
@@ -1034,4 +1037,226 @@ Status Erf<float>::Compute(OpKernelContext* context) const {
   return Status::OK();
 }
 
+// CPU kernel class for the Mod operator; the optional "fmod" attribute (0 or 1) selects the fmod code path.
+class Mod final : public OpKernel {
+ public:
+  explicit Mod(const OpKernelInfo& info) : OpKernel(info) {
+    int64_t fmod = 0;  // the attribute is optional: fmod_ keeps its default when it cannot be read
+    if (info.GetAttr<int64_t>("fmod", &fmod).IsOK()) {
+      ORT_ENFORCE((fmod == 0) || (fmod == 1), "fmod must have value either 0 or 1");
+      fmod_ = (fmod == 1);
+    }
+  }
+
+  Status Compute(OpKernelContext* context) const override;
+
+ private:
+  bool fmod_{false};  // true: Compute uses the std::fmod helpers; false: integer Modulus
+};
+
+ONNX_CPU_OPERATOR_KERNEL(
+    Mod,  // NOTE(review): presumably the operator name - confirm against the macro definition
+    10,  // same version number as the Mod entries added to cpu_execution_provider.cc
+    KernelDefBuilder().TypeConstraint("T", {DataTypeImpl::GetTensorType<float>(),  // tensor types accepted for "T"
+                                            DataTypeImpl::GetTensorType<double>(),
+                                            DataTypeImpl::GetTensorType<int64_t>(),
+                                            DataTypeImpl::GetTensorType<uint64_t>(),
+                                            DataTypeImpl::GetTensorType<int32_t>(),
+                                            DataTypeImpl::GetTensorType<uint32_t>(),
+                                            DataTypeImpl::GetTensorType<int16_t>(),
+                                            DataTypeImpl::GetTensorType<uint16_t>(),
+                                            DataTypeImpl::GetTensorType<int8_t>(),
+                                            DataTypeImpl::GetTensorType<uint8_t>(),
+                                            DataTypeImpl::GetTensorType<MLFloat16>()}),
+    Mod);  // NOTE(review): presumably the kernel class being registered - confirm
+
+namespace mod_internal {
+
+// Computes std::fmod(x, y), cast back to T, for every broadcast pair of X and Y and stores it in output 0.
+template <class T>
+void BroadCastFMod(const Tensor& X, const Tensor& Y, OpKernelContext* context) {
+  TBroadcaster<T, T> bcast{X, Y};
+  Tensor* const result = context->Output(0, bcast.GetOutputShape());
+  ORT_ENFORCE(result, "failed to get first output!");
+  TBroadcastOutput<T> bcast_out{bcast.GetSpanSize(), *result};
+
+  BroadcastLoopSpan(
+      bcast, bcast_out,
+      [](gsl::span<T> dst, const T& lhs, gsl::span<const T> rhs_span) {  // one X value, span of Y
+        std::transform(rhs_span.cbegin(), rhs_span.cend(), dst.begin(),
+                       [lhs](T rhs) {
+                         return static_cast<T>(std::fmod(lhs, rhs));
+                       });
+      },
+      [](gsl::span<T> dst, gsl::span<const T> lhs_span, const T& rhs) {  // span of X, one Y value
+        std::transform(lhs_span.cbegin(), lhs_span.cend(), dst.begin(),
+                       [rhs](T lhs) {
+                         return static_cast<T>(std::fmod(lhs, rhs));
+                       });
+      },
+      [](gsl::span<T> dst, gsl::span<const T> lhs_span, gsl::span<const T> rhs_span) {  // span vs span
+        std::transform(
+            lhs_span.cbegin(), lhs_span.cend(), rhs_span.cbegin(), dst.begin(),
+            [](T lhs, T rhs) {
+              return static_cast<T>(std::fmod(lhs, rhs));
+            });
+      });
+}
+
+// Integer modulus whose non-zero result takes the sign of the divisor y, unlike
+// the built-in %, whose result takes the sign of the dividend x.
+template <class T>
+inline T Modulus(T x, T y) {
+  const auto rem = x % y;
+  const bool wrong_sign = (rem < 0 && y > 0) || (rem > 0 && y < 0);
+  return static_cast<T>(wrong_sign ? rem + y : rem);
+}
+
+// Applies Modulus(x, y) to every broadcast pair of X and Y and stores the results in output 0.
+template <class T>
+void BroadCastMod(const Tensor& X, const Tensor& Y, OpKernelContext* context) {
+  TBroadcaster<T, T> bcast{X, Y};
+  Tensor* const result = context->Output(0, bcast.GetOutputShape());
+  ORT_ENFORCE(result, "failed to get first output!");
+  TBroadcastOutput<T> bcast_out{bcast.GetSpanSize(), *result};
+
+  // Modulus() casts its result back to T. The cast matters for small types
+  // such as int16_t and int8_t, which are converted to integers to perform
+  // the remainder operation; it is safe with respect to data loss.
+  BroadcastLoopSpan(
+      bcast, bcast_out,
+      [](gsl::span<T> dst, const T& lhs, gsl::span<const T> rhs_span) {  // one X value, span of Y
+        std::transform(rhs_span.cbegin(), rhs_span.cend(), dst.begin(),
+                       [lhs](T rhs) {
+                         return Modulus(lhs, rhs);
+                       });
+      },
+      [](gsl::span<T> dst, gsl::span<const T> lhs_span, const T& rhs) {  // span of X, one Y value
+        std::transform(lhs_span.cbegin(), lhs_span.cend(), dst.begin(),
+                       [rhs](T lhs) {
+                         return Modulus(lhs, rhs);
+                       });
+      },
+      [](gsl::span<T> dst, gsl::span<const T> lhs_span, gsl::span<const T> rhs_span) {  // span vs span
+        std::transform(
+            lhs_span.cbegin(), lhs_span.cend(), rhs_span.cbegin(), dst.begin(),
+            [](T lhs, T rhs) {
+              return Modulus(lhs, rhs);
+            });
+      });
+}
+
+// MLFloat16 fmod: converts each broadcast pair to float, applies std::fmod, stores the half result in output 0.
+void BroadCastMFloat16FMod(const Tensor& X, const Tensor& Y, OpKernelContext* context) {
+  TBroadcaster<MLFloat16, MLFloat16> bcast{X, Y};
+  Tensor* const result = context->Output(0, bcast.GetOutputShape());
+  ORT_ENFORCE(result, "failed to get first output!");
+  TBroadcastOutput<MLFloat16> bcast_out{bcast.GetSpanSize(), *result};
+
+  BroadcastLoopSpan(
+      bcast, bcast_out,
+      [](gsl::span<MLFloat16> dst, const MLFloat16& lhs, gsl::span<const MLFloat16> rhs_span) {
+        std::transform(rhs_span.cbegin(), rhs_span.cend(), dst.begin(),
+                       [lhs_fl = math::halfToFloat(lhs.val)](const MLFloat16& rhs) {
+                         return MLFloat16(math::floatToHalf(std::fmod(lhs_fl, math::halfToFloat(rhs.val))));
+                       });
+      },
+      [](gsl::span<MLFloat16> dst, gsl::span<const MLFloat16> lhs_span, const MLFloat16& rhs) {
+        std::transform(lhs_span.cbegin(), lhs_span.cend(), dst.begin(),
+                       [rhs_fl = math::halfToFloat(rhs.val)](const MLFloat16& lhs) {
+                         return MLFloat16(math::floatToHalf(std::fmod(math::halfToFloat(lhs.val), rhs_fl)));
+                       });
+      },
+      [](gsl::span<MLFloat16> dst, gsl::span<const MLFloat16> lhs_span, gsl::span<const MLFloat16> rhs_span) {
+        std::transform(
+            lhs_span.cbegin(), lhs_span.cend(), rhs_span.cbegin(), dst.begin(),
+            [](const MLFloat16& lhs, const MLFloat16& rhs) {
+              const auto remainder = std::fmod(math::halfToFloat(lhs.val),
+                                               math::halfToFloat(rhs.val));
+              return MLFloat16(math::floatToHalf(remainder));
+            });
+      });
+}
+
+}  // namespace mod_internal
+
+Status Mod::Compute(OpKernelContext* context) const {
+  // Picks the typed broadcast helper matching the element type shared by inputs 0 (X) and 1 (Y).
+
+  const auto& X = *context->Input<Tensor>(0);
+  const auto& Y = *context->Input<Tensor>(1);
+
+  auto dtype = X.DataType();
+  if (dtype != Y.DataType()) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "X and Y input types do not match: ",
+                           dtype, " vs ", Y.DataType());
+  }
+
+  using namespace mod_internal;
+  // Floating-point types are only accepted with fmod=1; integer types honour fmod_ either way.
+  if (dtype == DataTypeImpl::GetType<float>()) {
+    ORT_ENFORCE(fmod_, "fmod attribute must be true for float, float16 and double types");
+    BroadCastFMod<float>(X, Y, context);
+  } else if (dtype == DataTypeImpl::GetType<double>()) {
+    ORT_ENFORCE(fmod_, "fmod attribute must be true for float, float16 and double types");
+    BroadCastFMod<double>(X, Y, context);
+  } else if (dtype == DataTypeImpl::GetType<MLFloat16>()) {
+    ORT_ENFORCE(fmod_, "fmod attribute must be true for float, float16 and double types");
+    BroadCastMFloat16FMod(X, Y, context);
+  } else if (dtype == DataTypeImpl::GetType<uint8_t>()) {
+    if (fmod_) {
+      BroadCastFMod<uint8_t>(X, Y, context);
+    } else {
+      BroadCastMod<uint8_t>(X, Y, context);
+    }
+  } else if (dtype == DataTypeImpl::GetType<int8_t>()) {
+    if (fmod_) {
+      BroadCastFMod<int8_t>(X, Y, context);
+    } else {
+      BroadCastMod<int8_t>(X, Y, context);
+    }
+  } else if (dtype == DataTypeImpl::GetType<uint16_t>()) {
+    if (fmod_) {
+      BroadCastFMod<uint16_t>(X, Y, context);
+    } else {
+      BroadCastMod<uint16_t>(X, Y, context);
+    }
+  } else if (dtype == DataTypeImpl::GetType<int16_t>()) {
+    if (fmod_) {
+      BroadCastFMod<int16_t>(X, Y, context);
+    } else {
+      BroadCastMod<int16_t>(X, Y, context);
+    }
+  } else if (dtype == DataTypeImpl::GetType<uint32_t>()) {
+    if (fmod_) {
+      BroadCastFMod<uint32_t>(X, Y, context);
+    } else {
+      BroadCastMod<uint32_t>(X, Y, context);
+    }
+  } else if (dtype == DataTypeImpl::GetType<int32_t>()) {
+    if (fmod_) {
+      BroadCastFMod<int32_t>(X, Y, context);
+    } else {
+      BroadCastMod<int32_t>(X, Y, context);
+    }
+  } else if (dtype == DataTypeImpl::GetType<uint64_t>()) {
+    if (fmod_) {
+      BroadCastFMod<uint64_t>(X, Y, context);
+    } else {
+      BroadCastMod<uint64_t>(X, Y, context);
+    }
+  } else if (dtype == DataTypeImpl::GetType<int64_t>()) {
+    if (fmod_) {
+      BroadCastFMod<int64_t>(X, Y, context);
+    } else {
+      BroadCastMod<int64_t>(X, Y, context);
+    }
+  } else {
+    ORT_ENFORCE(false, "Unsupported data type: ", dtype);
+  }
+
+  return Status::OK();
+}
+
 }  // namespace onnxruntime
diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc
index b9451421a3..fc439d5290 100644
--- a/onnxruntime/test/onnx/main.cc
+++ b/onnxruntime/test/onnx/main.cc
@@ -236,11 +236,11 @@ int real_main(int argc, char* argv[], OrtEnv** p_env) {
     }
 
     std::unordered_set<std::string> cuda_flaky_tests = {
-      "fp16_inception_v1", "fp16_shufflenet", "fp16_tiny_yolov2"};
+        "fp16_inception_v1", "fp16_shufflenet", "fp16_tiny_yolov2"};
 
-#if (defined (_WIN32) && !defined(_WIN64)) || (defined(__GNUG__) && !defined(__LP64__))
+#if (defined(_WIN32) && !defined(_WIN64)) || (defined(__GNUG__) && !defined(__LP64__))
     //Minimize mem consumption
-    LoadTests (data_dirs, whitelisted_test_cases, per_sample_tolerance, relative_per_sample_tolerance, [&stat, &sf, enable_cuda, &cuda_flaky_tests] (ITestCase* l) {
+    LoadTests(data_dirs, whitelisted_test_cases, per_sample_tolerance, relative_per_sample_tolerance, [&stat, &sf, enable_cuda, &cuda_flaky_tests](ITestCase* l) {
       std::unique_ptr<ITestCase> test_case_ptr(l);
       if (enable_cuda && cuda_flaky_tests.find(l->GetTestCaseName()) != cuda_flaky_tests.end()) {
         return;
@@ -253,15 +253,14 @@ int real_main(int argc, char* argv[], OrtEnv** p_env) {
     });
 #else
     std::vector<ITestCase*> tests;
-    LoadTests(data_dirs, whitelisted_test_cases, per_sample_tolerance, relative_per_sample_tolerance, [&tests] (ITestCase* l) { tests.push_back(l); });
+    LoadTests(data_dirs, whitelisted_test_cases, per_sample_tolerance, relative_per_sample_tolerance, [&tests](ITestCase* l) { tests.push_back(l); });
     if (enable_cuda) {
       for (auto it = tests.begin(); it != tests.end();) {
         auto iter = cuda_flaky_tests.find((*it)->GetTestCaseName());
         if (iter != cuda_flaky_tests.end()) {
           delete *it;
           it = tests.erase(it);
-        }
-        else {
+        } else {
           ++it;
         }
       }
@@ -357,6 +356,7 @@ int real_main(int argc, char* argv[], OrtEnv** p_env) {
       {"tf_mobilenet_v1_1.0_224", "result mismatch"},
       {"mobilenetv2-1.0", "result mismatch"},
       {"mxnet_arcface", "result mismatch"},
+      {"mod_float_mixed_sign_example", "faulty test"}
   };
 
 #ifdef USE_CUDA
@@ -364,7 +364,7 @@ int real_main(int argc, char* argv[], OrtEnv** p_env) {
 #endif
   // clang-format on
 
-#if defined (_WIN32) && !defined(_WIN64)
+#if defined(_WIN32) && !defined(_WIN64)
   broken_tests["vgg19"] = "failed: bad allocation";
 #endif
 
diff --git a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc
index bfe0e5c8f8..f56da177ef 100644
--- a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc
+++ b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc
@@ -4,6 +4,7 @@
 #include "gtest/gtest.h"
 #include "test/providers/provider_test_utils.h"
 #include "core/util/math.h"
+#include <algorithm>
 #include <cmath>
 
 namespace onnxruntime {
@@ -14,7 +15,7 @@ TEST(MathOpTest, Add_int32) {
   test.AddInput<int32_t>("A", {3}, {1, 2, 3});
   test.AddInput<int32_t>("B", {3}, {4, 5, 6});
   test.AddOutput<int32_t>("C", {3}, {5, 7, 9});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//TensorRT parser: elementwise inputs must not be Int32
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT parser: elementwise inputs must not be Int32
 }
 
 TEST(MathOpTest, Add_int64) {
@@ -22,7 +23,7 @@ TEST(MathOpTest, Add_int64) {
   test.AddInput<int64_t>("A", {3}, {1, 2, 3});
   test.AddInput<int64_t>("B", {3}, {4, 5, 6});
   test.AddOutput<int64_t>("C", {3}, {5, 7, 9});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//TensorRT: INT64 is not supported
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: INT64 is not supported
 }
 
 TEST(MathOpTest, Add) {
@@ -68,7 +69,7 @@ TEST(MathOpTest, Add_Broadcast_0x0) {
   test.AddInput<float>("A", {}, {10.0f});
   test.AddInput<float>("B", {}, {2.0f});
   test.AddOutput<float>("C", {}, {12.0f});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//TensorRT: dynamic shape is not supported
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: dynamic shape is not supported
 }
 
 TEST(MathOpTest, Add_Broadcast_0x1) {
@@ -77,7 +78,7 @@ TEST(MathOpTest, Add_Broadcast_0x1) {
   test.AddInput<float>("A", {}, {10.0f});
   test.AddInput<float>("B", {1}, {2.0f});
   test.AddOutput<float>("C", {1}, {12.0f});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//TensorRT: dynamic shape is not supported
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: dynamic shape is not supported
 }
 
 TEST(MathOpTest, Add_Broadcast_1x0) {
@@ -86,7 +87,7 @@ TEST(MathOpTest, Add_Broadcast_1x0) {
   test.AddInput<float>("A", {1}, {10.0f});
   test.AddInput<float>("B", {}, {2.0f});
   test.AddOutput<float>("C", {1}, {12.0f});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//TensorRT: dynamic shape is not supported
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: dynamic shape is not supported
 }
 
 TEST(MathOpTest, Add_Broadcast_1x1) {
@@ -133,7 +134,7 @@ TEST(MathOpTest, Add_Broadcast_2x1x4_1x3x1) {
                          211.0f, 212.0f, 213.0f, 214.0f,
                          221.0f, 222.0f, 223.0f, 224.0f,
                          231.0f, 232.0f, 233.0f, 234.0f});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//Input batch size is inconsistent
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //Input batch size is inconsistent
 }
 
 TEST(MathOpTest, Add_Broadcast_2x1x1_3x4) {
@@ -153,7 +154,7 @@ TEST(MathOpTest, Add_Broadcast_2x1x1_3x4) {
                          211.0f, 212.0f, 213.0f, 214.0f,
                          221.0f, 222.0f, 223.0f, 224.0f,
                          231.0f, 232.0f, 233.0f, 234.0f});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//Input batch size is inconsistent
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //Input batch size is inconsistent
 }
 
 TEST(MathOpTest, Sub_int32) {
@@ -161,7 +162,7 @@ TEST(MathOpTest, Sub_int32) {
   test.AddInput<int32_t>("A", {3}, {1, 4, 3});
   test.AddInput<int32_t>("B", {3}, {4, 2, 4});
   test.AddOutput<int32_t>("C", {3}, {-3, 2, -1});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//TensorRT parser:elementwise inputs must not be Int32
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT parser:elementwise inputs must not be Int32
 }
 
 TEST(MathOpTest, Sub_int64) {
@@ -169,7 +170,7 @@ TEST(MathOpTest, Sub_int64) {
   test.AddInput<int64_t>("A", {3}, {1, 5, 6});
   test.AddInput<int64_t>("B", {3}, {4, 5, 3});
   test.AddOutput<int64_t>("C", {3}, {-3, 0, 3});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//TensorRT: INT64 is not supported
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: INT64 is not supported
 }
 
 TEST(MathOpTest, Sub) {
@@ -202,7 +203,7 @@ TEST(MathOpTest, Sub_Broadcast_Scalar) {
                         {-4.0f, -3.0f, -6.0f,
                          -5.0f, -3.5f, -105.0f,
                          -10.4f, 4.3f, -10'005.0f});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//TensorRT: dynamic shape is not supported
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: dynamic shape is not supported
 }
 
 TEST(MathOpTest, Mul_int32) {
@@ -210,7 +211,7 @@ TEST(MathOpTest, Mul_int32) {
   test.AddInput<int32_t>("A", {3}, {1, 2, 3});
   test.AddInput<int32_t>("B", {3}, {4, -3, 6});
   test.AddOutput<int32_t>("C", {3}, {4, -6, 18});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//TensorRT parser:elementwise inputs must not be Int32
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT parser:elementwise inputs must not be Int32
 }
 
 TEST(MathOpTest, Mul_int64) {
@@ -218,7 +219,7 @@ TEST(MathOpTest, Mul_int64) {
   test.AddInput<int64_t>("A", {3}, {3, 6, -3});
   test.AddInput<int64_t>("B", {3}, {4, -3, -2});
   test.AddOutput<int64_t>("C", {3}, {12, -18, 6});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//TensorRT: INT64 is not supported
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: INT64 is not supported
 }
 
 TEST(MathOpTest, Mul) {
@@ -244,7 +245,7 @@ TEST(MathOpTest, Div_int32) {
   test.AddInput<int32_t>("A", {3}, {4, 8, 8});
   test.AddInput<int32_t>("B", {3}, {1, 3, 2});
   test.AddOutput<int32_t>("C", {3}, {4, 2, 4});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//TensorRT parser:elementwise inputs must not be Int32
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT parser:elementwise inputs must not be Int32
 }
 
 TEST(MathOpTest, Div_int64) {
@@ -252,7 +253,7 @@ TEST(MathOpTest, Div_int64) {
   test.AddInput<int64_t>("A", {3}, {4, 8, 8});
   test.AddInput<int64_t>("B", {3}, {2, 3, 4});
   test.AddOutput<int64_t>("C", {3}, {2, 2, 2});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//TensorRT: INT64 is not supported
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: INT64 is not supported
 }
 
 TEST(MathOpTest, Div) {
@@ -291,7 +292,7 @@ TEST(MathOpTest, Abs_int32) {
   std::vector<int64_t> dims{4};
   test.AddInput<int32_t>("X", dims, {1, 2, -1, -5});
   test.AddOutput<int32_t>("Y", dims, {1, 2, 1, 5});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//TensorRT parser: Int32 not allowed as input to this layer
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT parser: Int32 not allowed as input to this layer
 }
 
 TEST(MathOpTest, Neg) {
@@ -319,7 +320,7 @@ TEST(MathOpTest, Neg_int32) {
   std::vector<int64_t> dims{4};
   test.AddInput<int32_t>("X", dims, {1, -2, 0, -10});
   test.AddOutput<int32_t>("Y", dims, {-1, 2, 0, 10});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//TensorRT parser: Int32 not allowed as input to this layer
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT parser: Int32 not allowed as input to this layer
 }
 
 TEST(MathOpTest, Floor) {
@@ -392,7 +393,7 @@ TEST(MathOpTest, Pow_Broadcast_Scalar0) {
   test.AddInput<float>("X", {}, {2.0f});
   test.AddInput<float>("Y", dims, {1.0f, 2.0f, 3.0f});
   test.AddOutput<float>("Z", dims, {2.0f, 4.0f, 8.0f});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//TensorRT: dynamic shape is not supported
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: dynamic shape is not supported
 }
 
 TEST(MathOpTest, Pow_Broadcast_Scalar1) {
@@ -402,7 +403,7 @@ TEST(MathOpTest, Pow_Broadcast_Scalar1) {
   test.AddInput<float>("X", dims, {1.0f, 2.0f, 3.0f});
   test.AddInput<float>("Y", {}, {2.0f});
   test.AddOutput<float>("Z", dims, {1.0f, 4.0f, 9.0f});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//TensorRT: dynamic shape is not supported
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: dynamic shape is not supported
 }
 
 TEST(MathOpTest, Exp) {
@@ -469,7 +470,7 @@ TEST(MathOpTest, Sum_8_Test1) {
                          311.0f, 312.0f, 313.0f,
                          321.0f, 322.0f, 323.0f,
                          331.0f, 332.0f, 333.0f});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});// TensorRT parser failed on this test
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  // TensorRT parser failed on this test
 }
 
 TEST(MathOpTest, Sum_8_Test2) {
@@ -581,7 +582,7 @@ TEST(MathOpTest, Max_8) {
                         {10.0f, 20.0f, 30.0f,
                          40.0f, 50.0f, 60.0f,
                          300.0f, 300.0f, 300.0f});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//Input batch size is inconsistent
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //Input batch size is inconsistent
 }
 
 TEST(MathOpTest, Not) {
@@ -757,7 +758,7 @@ TEST(MathOpTest, Mean_8) {
                         {12.0f / 3.0f, 22.0f / 3.0f, 32.0f / 3.0f,
                          43.0f / 3.0f, 53.0f / 3.0f, 63.0f / 3.0f,
                          74.0f / 3.0f, 84.0f / 3.0f, 94.0f / 3.0f});
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//Input batch size is inconsistent
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //Input batch size is inconsistent
 }
 
 #ifndef DISABLE_CONTRIB_OPS
@@ -1008,6 +1009,173 @@ TEST(MathOpTest, Erf) {
   test.AddOutput<float>("B", dims, {0.5204999f, 0.8427008f, 0.6778012f, 0.9953223f});
   test.Run();
 }
-}  // namespace test
 
+const int ModOp_ver = 10;  // passed as the second OpTester argument by every ModOpTest case
+
+TEST(ModOpTest, Fmod_float_mixed_sign) {
+  OpTester test("Mod", ModOp_ver);
+  test.AddAttribute<int64_t>("fmod", 1);
+  test.AddInput<float>("X", {6}, {-4.3f, 7.2f, 5.0f, 4.3f, -7.2f, 8.0f});
+  test.AddInput<float>("Y", {6}, {2.1f, -3.4f, 8.0f, -2.1f, 3.4f, 5.0f});
+  test.AddOutput<float>("Z", {6}, {-0.1f, 0.4f, 5.f, 0.1f, -0.4f, 3.f});  // each result carries the sign of its X element
+
+  test.Run();
+}
+
+// Test helper: converts each float in `input` to MLFloat16 through math::floatToHalf.
+std::vector<MLFloat16> MakeMLFloat16(const std::initializer_list<float>& input) {
+  std::vector<MLFloat16> halves;
+  for (const float value : input) {
+    halves.push_back(MLFloat16(math::floatToHalf(value)));
+  }
+  return halves;
+}
+
+TEST(ModOpTest, Fmod_float16_mixed_sign) {
+  OpTester test("Mod", ModOp_ver);
+  test.AddAttribute<int64_t>("fmod", 1);
+
+  test.AddInput<MLFloat16>("X", {6}, MakeMLFloat16({-4.3f, 7.2f, 5.0f, 4.3f, -7.2f, 8.0f}));
+  test.AddInput<MLFloat16>("Y", {6}, MakeMLFloat16({2.1f, -3.4f, 8.0f, -2.1f, 3.4f, 5.0f}));
+  // For float inputs the expected output is {-0.1f, 0.4f, 5.f, 0.1f, -0.4f, 3.f}; the half-precision test expects slightly different values.
+  test.AddOutput<MLFloat16>("Z", {6}, MakeMLFloat16({-0.1015625f, 0.3984375f, 5.f, 0.1015625f, -0.3984375f, 3.f}));
+
+  test.Run();
+}
+
+TEST(ModOpTest, Int8_mixed_sign) {
+  OpTester test("Mod", ModOp_ver);  // no "fmod" attribute: integer modulus path
+  test.AddInput<int8_t>("X", {6}, {-4, 7, 5, 4, -7, 8});
+  test.AddInput<int8_t>("Y", {6}, {2, -3, 8, -2, 3, 5});
+  test.AddOutput<int8_t>("Z", {6}, {0, -2, 5, 0, 2, 3});  // non-zero results carry the sign of Y
+
+  test.Run();
+}
+
+TEST(ModOpTest, Int8_mixed_sign_fmod) {
+  OpTester test("Mod", ModOp_ver);
+  test.AddAttribute<int64_t>("fmod", 1);
+
+  test.AddInput<int8_t>("X", {6}, {-4, 7, 5, 4, -7, 8});
+  test.AddInput<int8_t>("Y", {6}, {2, -3, 8, -2, 3, 5});
+  test.AddOutput<int8_t>("Z", {6}, {0, 1, 5, 0, -1, 3});  // non-zero results carry the sign of X
+
+  test.Run();
+}
+
+TEST(ModOpTest, UInt8_mod) {
+  OpTester test("Mod", ModOp_ver);  // no "fmod" attribute: integer modulus path
+  test.AddInput<uint8_t>("X", {6}, {4, 7, 5, 4, 7, 8});
+  test.AddInput<uint8_t>("Y", {6}, {2, 3, 8, 2, 3, 5});
+  test.AddOutput<uint8_t>("Z", {6}, {0, 1, 5, 0, 1, 3});  // plain remainders of unsigned operands
+
+  test.Run();
+}
+
+TEST(ModOpTest, Int16_mixed_sign) {
+  OpTester test("Mod", ModOp_ver);  // no "fmod" attribute: integer modulus path
+  test.AddInput<int16_t>("X", {6}, {-4, 7, 5, 4, -7, 8});
+  test.AddInput<int16_t>("Y", {6}, {2, -3, 8, -2, 3, 5});
+  test.AddOutput<int16_t>("Z", {6}, {0, -2, 5, 0, 2, 3});  // non-zero results carry the sign of Y
+
+  test.Run();
+}
+
+TEST(ModOpTest, Int16_mixed_sign_fmod) {
+  OpTester test("Mod", ModOp_ver);
+  test.AddAttribute<int64_t>("fmod", 1);
+
+  test.AddInput<int16_t>("X", {6}, {-4, 7, 5, 4, -7, 8});
+  test.AddInput<int16_t>("Y", {6}, {2, -3, 8, -2, 3, 5});
+  test.AddOutput<int16_t>("Z", {6}, {0, 1, 5, 0, -1, 3});  // non-zero results carry the sign of X
+
+  test.Run();
+}
+
+TEST(ModOpTest, UInt16_mod) {
+  OpTester test("Mod", ModOp_ver);  // no "fmod" attribute: integer modulus path
+  test.AddInput<uint16_t>("X", {6}, {4, 7, 5, 4, 7, 8});
+  test.AddInput<uint16_t>("Y", {6}, {2, 3, 8, 2, 3, 5});
+  test.AddOutput<uint16_t>("Z", {6}, {0, 1, 5, 0, 1, 3});  // plain remainders of unsigned operands
+
+  test.Run();
+}
+
+TEST(ModOpTest, Int32_mixed_sign) {
+  OpTester test("Mod", ModOp_ver);  // no "fmod" attribute: integer modulus path
+  test.AddInput<int32_t>("X", {6}, {-4, 7, 5, 4, -7, 8});
+  test.AddInput<int32_t>("Y", {6}, {2, -3, 8, -2, 3, 5});
+  test.AddOutput<int32_t>("Z", {6}, {0, -2, 5, 0, 2, 3});  // non-zero results carry the sign of Y
+
+  test.Run();
+}
+
+TEST(ModOpTest, Int32_mixed_sign_fmod) {
+  OpTester test("Mod", ModOp_ver);
+  test.AddAttribute<int64_t>("fmod", 1);
+
+  test.AddInput<int32_t>("X", {6}, {-4, 7, 5, 4, -7, 8});
+  test.AddInput<int32_t>("Y", {6}, {2, -3, 8, -2, 3, 5});
+  test.AddOutput<int32_t>("Z", {6}, {0, 1, 5, 0, -1, 3});  // non-zero results carry the sign of X
+
+  test.Run();
+}
+
+TEST(ModOpTest, UInt32_mod) {
+  OpTester test("Mod", ModOp_ver);  // no "fmod" attribute: integer modulus path
+  test.AddInput<uint32_t>("X", {6}, {4, 7, 5, 4, 7, 8});
+  test.AddInput<uint32_t>("Y", {6}, {2, 3, 8, 2, 3, 5});
+  test.AddOutput<uint32_t>("Z", {6}, {0, 1, 5, 0, 1, 3});  // plain remainders of unsigned operands
+
+  test.Run();
+}
+
+TEST(ModOpTest, Int64_mixed_sign) {
+  OpTester test("Mod", ModOp_ver);  // no "fmod" attribute: integer modulus path
+  test.AddInput<int64_t>("X", {6}, {-4, 7, 5, 4, -7, 8});
+  test.AddInput<int64_t>("Y", {6}, {2, -3, 8, -2, 3, 5});
+  test.AddOutput<int64_t>("Z", {6}, {0, -2, 5, 0, 2, 3});  // non-zero results carry the sign of Y
+
+  test.Run();
+}
+
+TEST(ModOpTest, Int64_mixed_sign_fmod) {
+  OpTester test("Mod", ModOp_ver);
+  test.AddAttribute<int64_t>("fmod", 1);
+
+  test.AddInput<int64_t>("X", {6}, {-4, 7, 5, 4, -7, 8});
+  test.AddInput<int64_t>("Y", {6}, {2, -3, 8, -2, 3, 5});
+  test.AddOutput<int64_t>("Z", {6}, {0, 1, 5, 0, -1, 3});  // non-zero results carry the sign of X
+
+  test.Run();
+}
+
+TEST(ModOpTest, UInt64_mod) {
+  OpTester test("Mod", ModOp_ver);  // no "fmod" attribute: integer modulus path
+  test.AddInput<uint64_t>("X", {6}, {4, 7, 5, 4, 7, 8});
+  test.AddInput<uint64_t>("Y", {6}, {2, 3, 8, 2, 3, 5});
+  test.AddOutput<uint64_t>("Z", {6}, {0, 1, 5, 0, 1, 3});  // plain remainders of unsigned operands
+
+  test.Run();
+}
+
+TEST(ModOpTest, Int32_mod_bcast) {
+  OpTester test("Mod", ModOp_ver);
+
+  std::vector<int32_t> input_sequence;
+  input_sequence.resize(30);
+  std::generate(input_sequence.begin(), input_sequence.end(),
+                [n = 0]() mutable { return n++; });
+
+  // input [0..29]
+  test.AddInput<int32_t>("X", {3, 2, 5}, input_sequence);
+  test.AddInput<int32_t>("Y", {1}, {7});  // single divisor, expected to apply to every element of X
+
+  test.AddOutput<int32_t>("Z", {3, 2, 5},  // output keeps the shape of X; values are 0..29 modulo 7
+                          {0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1});
+
+  test.Run();
+}
+
+}  // namespace test
 }  // namespace onnxruntime
diff --git a/onnxruntime/test/python/onnx_backend_test_series.py b/onnxruntime/test/python/onnx_backend_test_series.py
index b64849db35..9cabd5ca14 100644
--- a/onnxruntime/test/python/onnx_backend_test_series.py
+++ b/onnxruntime/test/python/onnx_backend_test_series.py
@@ -89,10 +89,7 @@ backend_test.exclude(r'('
 '|^test_resize_nearest_cpu.*'
 '|^test_resize_upsample_linear_cpu.*'
 '|^test_resize_upsample_nearest_cpu.*'
-'|^test_mod_bcast.*'
-'|^test_mod_float_mixed_sign_example.*'
-'|^test_mod_fmod_mixed_sign_example.*'
-'|^test_mod_int64_mixed_sign_example.*'
+'|^test_mod_float_mixed_sign_example.*'
 '|^test_reversesequence_batch_cpu.*'
 '|^test_reversesequence_time_cpu.*'
 '|^test_roialign_cpu.*'