diff --git a/onnxruntime/contrib_ops/cpu/activations.cc b/onnxruntime/contrib_ops/cpu/activations.cc
index f7f3d048cb..6a9fbe52ef 100644
--- a/onnxruntime/contrib_ops/cpu/activations.cc
+++ b/onnxruntime/contrib_ops/cpu/activations.cc
@@ -34,19 +34,5 @@ ONNX_OPERATOR_KERNEL_EX(
     KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
     Gelu<float>);
 
-ONNX_OPERATOR_KERNEL_EX(
-    FastGelu,
-    kMSDomain,
-    1,
-    kCpuExecutionProvider,
-    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
-    FastGelu<float>);
-
-template<typename T>
-constexpr float FastGelu<T>::kAlpha;
-
-template<typename T>
-constexpr float FastGelu<T>::kGamma;
-
 }  // namespace contrib
 }  // namespace onnxruntime
diff --git a/onnxruntime/contrib_ops/cpu/activations.h b/onnxruntime/contrib_ops/cpu/activations.h
index 541016396d..f127ea2fba 100644
--- a/onnxruntime/contrib_ops/cpu/activations.h
+++ b/onnxruntime/contrib_ops/cpu/activations.h
@@ -73,68 +73,5 @@ class Gelu : public OpKernel {
   }
 };
 
-template <typename T>
-class FastGelu : public OpKernel {
- public:
-  FastGelu(const OpKernelInfo& info) : OpKernel(info) {}
-
-  Status Compute(OpKernelContext* context) const override {
-    const auto* X = context->Input<Tensor>(0);
-    Tensor* Y = context->Output(0, X->Shape());
-    concurrency::ThreadPool* tp = context->GetOperatorThreadPool();
-    if (nullptr != tp) {
-      const T* input = X->template Data<T>();
-      T* output = Y->template MutableData<T>();
-      int task_count = tp->NumThreads() + 1;
-      int64_t elem_count = X->Shape().Size();
-      const auto coefficient = kAlpha * kGamma;
-      if (elem_count > task_count) {
-        tp->SimpleParallelFor(static_cast<std::ptrdiff_t>(task_count), [ input,
-                                      output,
-                                      elem_count,
-                                      task_count,
-                                      kAlpha = this->kAlpha,
-                                      coefficient ](std::ptrdiff_t i) {
-          int64_t elem_inx_start = i * elem_count / task_count;
-          int64_t elem_inx_end = (i + 1) * elem_count / task_count;
-          for (int64_t elem_inx = elem_inx_start; elem_inx < elem_inx_end; elem_inx++) {
-            const auto x = input[elem_inx];
-            output[elem_inx] = x * (coefficient * x * x  + kAlpha);
-            output[elem_inx] = 0.5f * x * (1.0f + tanh(output[elem_inx]));
-          }
-        });
-        return Status::OK();
-      }
-    }
-
-    //
-    // Commented out EIGEN implentation due to EIGEN bug.
-    // On Windows build with GPU enabled, kGamma * x_pow_3 + EIGEN_X below would produce incorrect
-    // result, same issue discovered in fast_gelu_grad op.
-    // Given that CPU kernel is mostly for conformance check, where performance is not of high
-    // priority, to workaround this bug, use a for loop and avoid using EIGEN library.
-    //
-    // const auto x_pow_3 = EIGEN_X.cube();
-    // const auto tanh_result = (kAlpha * (kGamma * x_pow_3 + EIGEN_X)).tanh();
-
-    // EIGEN_Y = 0.5f * EIGEN_X * (1.f + tanh_result);
-
-    const T* input = X->template Data<T>();
-    T* output = Y->template MutableData<T>();
-    int64_t elem_count = X->Shape().Size();
-    for (auto i = 0; i < elem_count; ++i) {
-      const auto x_val = input[i];
-      const auto x_cube = x_val * x_val * x_val;
-      T tanh_result = std::tanh(kAlpha * x_val + kAlpha * kGamma * x_cube);
-      output[i] = 0.5f * x_val * (tanh_result + 1.0f);
-    }
-    return Status::OK();
-  }
-
- private:
-  static constexpr float kAlpha = static_cast<float>(M_2_SQRTPI * M_SQRT1_2);
-  static constexpr float kGamma = 0.044715f;
-};
-
 }  // namespace contrib
 }  // namespace onnxruntime