diff --git a/onnxruntime/contrib_ops/cpu/activations.cc b/onnxruntime/contrib_ops/cpu/activations.cc index f7f3d048cb..6a9fbe52ef 100644 --- a/onnxruntime/contrib_ops/cpu/activations.cc +++ b/onnxruntime/contrib_ops/cpu/activations.cc @@ -34,19 +34,5 @@ ONNX_OPERATOR_KERNEL_EX( KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), Gelu); -ONNX_OPERATOR_KERNEL_EX( - FastGelu, - kMSDomain, - 1, - kCpuExecutionProvider, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - FastGelu); - -template -constexpr float FastGelu::kAlpha; - -template -constexpr float FastGelu::kGamma; - } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/activations.h b/onnxruntime/contrib_ops/cpu/activations.h index 541016396d..f127ea2fba 100644 --- a/onnxruntime/contrib_ops/cpu/activations.h +++ b/onnxruntime/contrib_ops/cpu/activations.h @@ -73,68 +73,5 @@ class Gelu : public OpKernel { } }; -template -class FastGelu : public OpKernel { - public: - FastGelu(const OpKernelInfo& info) : OpKernel(info) {} - - Status Compute(OpKernelContext* context) const override { - const auto* X = context->Input(0); - Tensor* Y = context->Output(0, X->Shape()); - concurrency::ThreadPool* tp = context->GetOperatorThreadPool(); - if (nullptr != tp) { - const T* input = X->template Data(); - T* output = Y->template MutableData(); - int task_count = tp->NumThreads() + 1; - int64_t elem_count = X->Shape().Size(); - const auto coefficient = kAlpha * kGamma; - if (elem_count > task_count) { - tp->SimpleParallelFor(static_cast(task_count), [ input, - output, - elem_count, - task_count, - kAlpha = this->kAlpha, - coefficient ](std::ptrdiff_t i) { - int64_t elem_inx_start = i * elem_count / task_count; - int64_t elem_inx_end = (i + 1) * elem_count / task_count; - for (int64_t elem_inx = elem_inx_start; elem_inx < elem_inx_end; elem_inx++) { - const auto x = input[elem_inx]; - output[elem_inx] = x * (coefficient * x * x + kAlpha); - output[elem_inx] = 0.5f * x * (1.0f + tanh(output[elem_inx])); - } - }); - return Status::OK(); - } - } - - // - // Commented out EIGEN implentation due to EIGEN bug. - // On Windows build with GPU enabled, kGamma * x_pow_3 + EIGEN_X below would produce incorrect - // result, same issue discovered in fast_gelu_grad op. - // Given that CPU kernel is mostly for conformance check, where performance is not of high - // priority, to workaround this bug, use a for loop and avoid using EIGEN library. - // - // const auto x_pow_3 = EIGEN_X.cube(); - // const auto tanh_result = (kAlpha * (kGamma * x_pow_3 + EIGEN_X)).tanh(); - - // EIGEN_Y = 0.5f * EIGEN_X * (1.f + tanh_result); - - const T* input = X->template Data(); - T* output = Y->template MutableData(); - int64_t elem_count = X->Shape().Size(); - for (auto i = 0; i < elem_count; ++i) { - const auto x_val = input[i]; - const auto x_cube = x_val * x_val * x_val; - T tanh_result = std::tanh(kAlpha * x_val + kAlpha * kGamma * x_cube); - output[i] = 0.5f * x_val * (tanh_result + 1.0f); - } - return Status::OK(); - } - - private: - static constexpr float kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); - static constexpr float kGamma = 0.044715f; -}; - } // namespace contrib } // namespace onnxruntime