diff --git a/onnxruntime/core/providers/cuda/shared_inc/fast_divmod.h b/onnxruntime/core/providers/cuda/shared_inc/fast_divmod.h index 55a4daa66b..89a4a0c1d5 100644 --- a/onnxruntime/core/providers/cuda/shared_inc/fast_divmod.h +++ b/onnxruntime/core/providers/cuda/shared_inc/fast_divmod.h @@ -34,7 +34,7 @@ struct fast_divmod { } __host__ __device__ inline int div(int n) const { -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) uint32_t t = __umulhi(M_, n); return (t + n) >> l_; #else diff --git a/onnxruntime/core/providers/rocm/shared_inc/fast_divmod.h b/onnxruntime/core/providers/rocm/shared_inc/fast_divmod.h deleted file mode 100644 index dce3e83a55..0000000000 --- a/onnxruntime/core/providers/rocm/shared_inc/fast_divmod.h +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once - -#include -#include -#include -#include -#include "core/common/common.h" - -namespace onnxruntime { -namespace rocm { - -// The code below is based on section 4 Unsigned division of paper https://gmplib.org/~tege/divcnst-pldi94.pdf -// In current ORT, fast_divmod is used for calculating the position of a element in tensor, -// so unsigned integer division from the paper is good enough for ORT. The advantage is that div is very simple, -// then GPU compiler can do loop unroll easilly when divmod is called in a loop. -struct fast_divmod { - fast_divmod(int d = 1) { - d_ = d == 0 ? 1 : d; - ORT_ENFORCE(d_ >= 1 && d_ <= static_cast(std::numeric_limits::max())); - - for (l_ = 0; l_ < 32; l_++) if ((1U << l_) >= d_) break; - - uint64_t one = 1; - uint64_t m = ((one << 32) * ((one << l_) - d_)) / d_ + 1; - M_ = static_cast(m); - // according to paper, the value of m' should fit in a unsigned integer. - ORT_ENFORCE(M_ > 0 && M_ == m); - } - - __host__ __device__ inline int div(int n) const { -#ifdef __HIP_DEVICE_COMPILE__ - uint32_t t = __umulhi(M_, n); - return (t + n) >> l_; -#else - // Using uint64_t for t, then t + n won't overflow. - uint64_t t = ((uint64_t) M_ * n) >> 32; - return static_cast((t + n) >> l_); -#endif - } - - __host__ __device__ inline int mod(int n) const { - return n - div(n) * d_; - } - - __host__ __device__ inline void divmod(int n, int& q, int& r) const { - q = div(n); - r = n - q * d_; - } - - uint32_t d_; // divisor - uint32_t M_; // m' in the paper. - uint32_t l_; // l_ = ceil(log2(d_)) -}; - -} // namespace rocm -} // namespace onnxruntime diff --git a/tools/ci_build/amd_hipify.py b/tools/ci_build/amd_hipify.py index f9e40965e0..2d2ff17a04 100644 --- a/tools/ci_build/amd_hipify.py +++ b/tools/ci_build/amd_hipify.py @@ -131,7 +131,6 @@ provider_excluded_files = [ 'rnn/rnn_impl.cu', 'rnn/rnn_impl.h', 'shared_inc/cuda_call.h', - 'shared_inc/fast_divmod.h', 'shared_inc/fpgeneric.h', 'shared_inc/integer_gemm.h', 'tensor/gather_nd_impl.cu',