Remove FastGelu from activations.

This commit is contained in:
Sergii Dymchenko 2020-04-09 19:17:54 -07:00
parent 507d2bb9b9
commit bb2f427990
2 changed files with 0 additions and 77 deletions

View file

@ -34,19 +34,5 @@ ONNX_OPERATOR_KERNEL_EX(
KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
Gelu<float>);
// Kernel registration for the FastGelu contrib op: binds the FastGelu<float>
// kernel class to the op named FastGelu in kMSDomain, version 1, for
// kCpuExecutionProvider, with type parameter "T" constrained to float tensors.
// NOTE(review): argument meanings are read off the ONNX_OPERATOR_KERNEL_EX call
// shape; the macro itself is defined outside this view - confirm against it.
ONNX_OPERATOR_KERNEL_EX(
FastGelu,
kMSDomain,
1,
kCpuExecutionProvider,
KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
FastGelu<float>);
// Namespace-scope definitions of FastGelu's static constexpr data members.
// Before C++17 a static constexpr member that is odr-used needs exactly such an
// out-of-class definition (without an initializer) or the link fails. From
// C++17 on these members are implicitly inline and the redeclarations below are
// redundant (deprecated, but still valid).
template<typename T>
constexpr float FastGelu<T>::kAlpha;
template<typename T>
constexpr float FastGelu<T>::kGamma;
} // namespace contrib
} // namespace onnxruntime

View file

@ -73,68 +73,5 @@ class Gelu : public OpKernel {
}
};
// CPU kernel for the FastGelu contrib op: the tanh-based approximation of GELU,
//
//   y = 0.5 * x * (1 + tanh(kAlpha * (x + kGamma * x^3)))
//
// where kAlpha = sqrt(2 / pi) and kGamma = 0.044715. The output tensor has the
// same shape as the input tensor and is computed element-wise.
template <typename T>
class FastGelu : public OpKernel {
 public:
  FastGelu(const OpKernelInfo& info) : OpKernel(info) {}

  // Reads input 0, allocates output 0 with the same shape, and fills it with the
  // FastGelu of every input element. Work is split across the operator thread
  // pool when one is available and there are more elements than tasks; otherwise
  // a plain serial loop is used. Always returns Status::OK().
  Status Compute(OpKernelContext* context) const override {
    const auto* X = context->Input<Tensor>(0);
    Tensor* Y = context->Output(0, X->Shape());
    concurrency::ThreadPool* tp = context->GetOperatorThreadPool();

    const T* input = X->template Data<T>();
    T* output = Y->template MutableData<T>();
    const int64_t elem_count = X->Shape().Size();

    if (nullptr != tp) {
      const int task_count = tp->NumThreads() + 1;
      // Only go parallel when every task can receive at least one element.
      if (elem_count > task_count) {
        const auto coefficient = kAlpha * kGamma;
        // kAlpha is copied into the closure by value (init-capture) so the lambda
        // does not need `this` and the static member is not odr-used.
        tp->SimpleParallelFor(
            static_cast<std::ptrdiff_t>(task_count),
            [input, output, elem_count, task_count, alpha = kAlpha, coefficient](std::ptrdiff_t i) {
              // Task i handles the half-open slice [first, last); consecutive
              // slices tile [0, elem_count) without gaps or overlap.
              const int64_t first = i * elem_count / task_count;
              const int64_t last = (i + 1) * elem_count / task_count;
              for (int64_t idx = first; idx < last; ++idx) {
                const T x = input[idx];
                // tanh argument: kAlpha * (x + kGamma * x^3), factored as
                // x * (kAlpha * kGamma * x^2 + kAlpha). Kept in a local instead
                // of being parked in the output buffer.
                const T inner = x * (coefficient * x * x + alpha);
                // std::tanh selects the overload matching T; unqualified tanh
                // may resolve to the C double version.
                output[idx] = 0.5f * x * (1.0f + std::tanh(inner));
              }
            });
        return Status::OK();
      }
    }

    //
    // Commented out EIGEN implementation due to EIGEN bug.
    // On Windows build with GPU enabled, kGamma * x_pow_3 + EIGEN_X below would produce incorrect
    // result, same issue discovered in fast_gelu_grad op.
    // Given that CPU kernel is mostly for conformance check, where performance is not of high
    // priority, to workaround this bug, use a for loop and avoid using EIGEN library.
    //
    // const auto x_pow_3 = EIGEN_X.cube();
    // const auto tanh_result = (kAlpha * (kGamma * x_pow_3 + EIGEN_X)).tanh();
    // EIGEN_Y = 0.5f * EIGEN_X * (1.f + tanh_result);

    // Serial fallback. The index is int64_t to match elem_count: an `int` index
    // would overflow (undefined behaviour) on tensors with more than INT_MAX
    // elements.
    for (int64_t i = 0; i < elem_count; ++i) {
      const auto x_val = input[i];
      const auto x_cube = x_val * x_val * x_val;
      const T tanh_result = std::tanh(kAlpha * x_val + kAlpha * kGamma * x_cube);
      output[i] = 0.5f * x_val * (tanh_result + 1.0f);
    }
    return Status::OK();
  }

 private:
  // sqrt(2 / pi), expressed as (2 / sqrt(pi)) * (1 / sqrt(2)).
  static constexpr float kAlpha = static_cast<float>(M_2_SQRTPI * M_SQRT1_2);
  // Cubic-term coefficient of the tanh GELU approximation.
  static constexpr float kGamma = 0.044715f;
};
} // namespace contrib
} // namespace onnxruntime