From c4b6559be93621f919086788fdcd9969a841d69b Mon Sep 17 00:00:00 2001 From: Jesse Benson Date: Tue, 12 Jan 2021 18:18:57 -0800 Subject: [PATCH] Update reduction_all.cu --- .../training_ops/rocm/reduction/reduction_all.cu | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/orttraining/orttraining/training_ops/rocm/reduction/reduction_all.cu b/orttraining/orttraining/training_ops/rocm/reduction/reduction_all.cu index 346e093bd4..d6477ebf08 100644 --- a/orttraining/orttraining/training_ops/rocm/reduction/reduction_all.cu +++ b/orttraining/orttraining/training_ops/rocm/reduction/reduction_all.cu @@ -12,12 +12,12 @@ namespace onnxruntime { namespace rocm { -template +template __global__ void ScalarSqrtKernel(Tin* input, Tout* output) { *output = (Tout)_Sqrt(*input); } -template +template void ScalarSqrt(Tin* input, Tout* output) { hipLaunchKernelGGL(ScalarSqrtKernel, dim3(1), dim3(1), 0, 0, input, output); } @@ -61,7 +61,7 @@ __global__ void MultiTensorReduceKernel(ChunkGroup<1> chunk_group, TOut* output) const int wid = threadIdx.x / GPU_WARP_SIZE; // Shape is 2 x warp_count_in_block. - HIP_DYNAMIC_SHARED( unsigned char, shared_memory_) + extern __shared__ unsigned char shared_memory_[]; TBuf* shared_memory = reinterpret_cast(shared_memory_); if (lid == 0) { @@ -79,7 +79,7 @@ __global__ void MultiTensorReduceKernel(ChunkGroup<1> chunk_group, TOut* output) } if (threadIdx.x == 0) { - atomic_add(w_norm, TOutOp()(shared_memory[0])); + atomic_add(w_norm, TOutOp()(TOut(shared_memory[0]))); } } @@ -100,7 +100,7 @@ void MultiTensorReduce(ChunkGroup<1> chunk_group, TOut* output) { template void MultiTensorReduceL2::operator()(ChunkGroup<1> chunk_group, TOut* output) { using TBuf = AccumulationType_t; - MultiTensorReduce, Cast>(chunk_group, output); + MultiTensorReduce(chunk_group, output); } #define INSTANTIATE_MULTI_TENSOR_REDUCTION_L2_FUNCTOR(TIn, TOut) \