diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_functions.cc b/onnxruntime/core/providers/cuda/reduction/reduction_functions.cc new file mode 100644 index 0000000000..cb58191c1d --- /dev/null +++ b/onnxruntime/core/providers/cuda/reduction/reduction_functions.cc @@ -0,0 +1,95 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/cuda/reduction/reduction_functions.h" + +#include +#include + +#include "core/common/optional.h" +#include "core/framework/tensor_shape.h" +#include "core/providers/common.h" + +namespace onnxruntime { +namespace cuda { + +ApplicableMatrixReduction get_applicable_matrix_reduction( + const cudnnReduceTensorOp_t cudnn_reduce_op, + const std::vector& dims, const std::vector& original_axes, + int& m_out, int& n_out) { + if (cudnn_reduce_op != CUDNN_REDUCE_TENSOR_ADD) { + return ApplicableMatrixReduction::None; + } + + const auto rank = gsl::narrow(dims.size()); + + // min and max of single contiguous range of axes + const auto minmax_axes = [&]() -> optional> { + // empty axes means reduce all dimensions + if (original_axes.empty()) { + return std::make_pair(int64_t{0}, rank - 1); + } + + // normalize axis values and sort + const std::vector axes = [&original_axes, rank]() { + std::vector result(original_axes); + std::for_each( + result.begin(), result.end(), + [rank](int64_t& axis) { axis = HandleNegativeAxis(axis, rank); }); + std::sort(result.begin(), result.end()); + return result; + }(); + + for (auto a = axes.begin(), b = axes.begin() + 1; + b != axes.end(); + ++a, ++b) { + ORT_ENFORCE(*a != *b, "axes must not contain duplicate values"); + if (*a + 1 != *b) { // not contiguous + return {}; + } + } + + return std::make_pair(axes.front(), axes.back()); + }(); + + if (!minmax_axes.has_value()) { + return ApplicableMatrixReduction::None; + } + + const auto& min_axis = minmax_axes.value().first; + const auto& max_axis = minmax_axes.value().second; + + // axes from beginning means row reduction, axes to end means column reduction + // currently we don't handle axes from beginning to end, but that could be either + const bool axes_from_beginning = min_axis == 0; + const bool axes_to_end = max_axis == rank - 1; + + // handle axes anchored to one of beginning or end, not both + if (axes_from_beginning == axes_to_end) { + return ApplicableMatrixReduction::None; + } + + const int64_t m_end_axis = axes_from_beginning ? max_axis + 1 : min_axis; + + const TensorShape& shape = TensorShape::ReinterpretBaseType(dims); + + const auto m = shape.SizeToDimension(m_end_axis); + const auto n = shape.SizeFromDimension(m_end_axis); + + ORT_ENFORCE(m > 0 && n > 0, "shape must not have negative dimensions: ", shape); + + if (m > std::numeric_limits::max() || + n > std::numeric_limits::max()) { + return ApplicableMatrixReduction::None; + } + + m_out = gsl::narrow_cast(m); + n_out = gsl::narrow_cast(n); + + return axes_from_beginning + ? ApplicableMatrixReduction::Rows + : ApplicableMatrixReduction::Columns; +} + +} // namespace cuda +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu b/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu index 785c55a8c2..80b0f6c723 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu +++ b/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu @@ -1,51 +1,123 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include "core/providers/cuda/reduction/reduction_functions.h" + #include + #include #include -#include "core/providers/cuda/cu_inc/common.cuh" + +#include "core/common/common.h" #include "core/providers/cuda/atomic/common.cuh" -#include "reduction_functions.h" -#include "reduction_utils.cuh" - -#define NUM_ELEMENTS_PER_THREAD 4 -#define NUM_WARPS_PER_BLOCK 8 -#define MAX_NUM_BLOCKS 256 - -#define ALL_ONE_MASK 0xFFFFFFFF -#define ONE_MASK 0x00000001 +#include "core/providers/cuda/cu_inc/common.cuh" +#include "core/providers/cuda/shared_inc/cuda_utils.h" +#include "core/providers/cuda/reduction/reduction_utils.cuh" namespace onnxruntime { namespace cuda { -std::pair compute_block_size(int size) { - int x = GPU_WARP_SIZE; - int y = std::min(NUM_WARPS_PER_BLOCK, std::max(1, size / (NUM_ELEMENTS_PER_THREAD * GPU_WARP_SIZE))); - return std::make_pair(x, y); +namespace detail { +constexpr auto MAX_NUM_ELEMENTS_PER_THREAD = 4; +constexpr auto MAX_NUM_WARPS_PER_BLOCK = 8; +constexpr auto MAX_NUM_BLOCKS_IN_GRID_ROW = 256; +constexpr auto MAX_NUM_GRID_ROWS = 32768; + +dim3 compute_block_dim(int num_cols) { + const int x = GPU_WARP_SIZE; + const int y = std::min(MAX_NUM_WARPS_PER_BLOCK, std::max(1, num_cols / (MAX_NUM_ELEMENTS_PER_THREAD * x))); + return dim3(x, y); } -int compute_grid_size(int size) { - const auto block = compute_block_size(size); - return std::min(MAX_NUM_BLOCKS, std::max(1, size / (NUM_ELEMENTS_PER_THREAD * block.first * block.second))); +std::pair compute_grid_and_block_dims(int num_rows, int num_cols) { + const auto block_dim = compute_block_dim(num_cols); + const auto grid_x = + std::min( + MAX_NUM_BLOCKS_IN_GRID_ROW, + std::max(1, num_cols / (MAX_NUM_ELEMENTS_PER_THREAD * block_dim.x * block_dim.y))); + const auto grid_y = std::min(MAX_NUM_GRID_ROWS, num_rows); + const dim3 grid_dim(grid_x, grid_y); + return {grid_dim, block_dim}; } -int compute_reduction_buffer_size(int element_size, int size) { - const int num_blocks = compute_grid_size(size); - return static_cast(num_blocks * element_size + sizeof(int)); +uintptr_t round_up_to_aligned(uintptr_t original, size_t alignment) { + assert((alignment & (alignment - 1)) == 0); + const size_t alignment_mask = ~(alignment - 1); + return (original + alignment - 1) & alignment_mask; } -template -__global__ void reduce_all_kernel(const int size, const TIn * data, TOut* output, TOut* buffer) { - extern __shared__ unsigned char shared_memory_[]; - TOut* shared_memory = reinterpret_cast(shared_memory_); - // Thread-level indexes: +/** + * call_reduce_matrix_columns() intermediate buffer layout + * + * Given buffer element type TBuf, the intermediate buffer layout looks like this: + * + * ----- + * m * num_blocks_per_row * sizeof(TBuf) bytes for block reductions per row + * alignment padding bytes as needed + * m * sizeof(int) bytes for block done counts per row + * ----- + */ + +size_t compute_reduce_matrix_columns_intermediate_buffer_size( + int element_size, int num_rows, int num_cols) { + ORT_ENFORCE(element_size >= 0 && num_rows >= 0 && num_cols >= 0); + + const auto grid_dim = compute_grid_and_block_dims(num_rows, num_cols).first; + + size_t buffer_size{}; + + // at the beginning, for sizing purposes, assume we are aligned + buffer_size += static_cast(num_rows) * grid_dim.x * element_size; + + buffer_size = round_up_to_aligned(buffer_size, alignof(int)); + buffer_size += static_cast(num_rows) * sizeof(int); + + // add padding to give us room to align + buffer_size += alignof(max_align_t) - 1; + + return buffer_size; +} + +template +Status get_reduction_buffers( + int num_rows, int num_cols, void* buffer, size_t buffer_size, + TBuf*& block_reductions_buffer, int*& block_done_counts_buffer) { + const auto grid_dim = compute_grid_and_block_dims(num_rows, num_cols).first; + + const uintptr_t begin_addr = reinterpret_cast(buffer); + const uintptr_t block_reductions_addr = + round_up_to_aligned(begin_addr, alignof(TBuf)); + const uintptr_t block_done_counts_buffer_addr = + round_up_to_aligned( + block_reductions_addr + static_cast(num_rows) * grid_dim.x * sizeof(TBuf), alignof(int)); + const uintptr_t end_addr = + block_done_counts_buffer_addr + static_cast(num_rows) * sizeof(int); + const size_t required_size = end_addr - begin_addr; + + ORT_RETURN_IF_NOT( + required_size <= buffer_size, + "Buffer size is too small (", buffer_size, " bytes). ", + "At least ", required_size, " bytes are needed from the given base address (", buffer, ")."); + + block_reductions_buffer = reinterpret_cast(block_reductions_addr); + block_done_counts_buffer = reinterpret_cast(block_done_counts_buffer_addr); + + return Status::OK(); +} + +template +__device__ void reduce_all( + const int num_elements, const TIn* const input, TOut* const output, + TBuf* const block_reductions_buffer, int* const block_done_count_buffer) { + extern __shared__ unsigned char shared_memory_bytes[]; + TBuf* shared_memory = reinterpret_cast(shared_memory_bytes); + // Thread-level indices: // Linear index of thread in block. const int tid_in_block = threadIdx.y * blockDim.x + threadIdx.x; // Total number of threads in a 2-D block. const int num_threads_in_block = blockDim.x * blockDim.y; - // Warp-level indexes: + // Warp-level indices: // Warp index of thread. const int wid_in_block = tid_in_block / GPU_WARP_SIZE; // Lane index of thread. @@ -53,35 +125,35 @@ __global__ void reduce_all_kernel(const int size, const TIn * data, TOut* output // Warp count per block. const int num_warps_in_block = num_threads_in_block / GPU_WARP_SIZE; - // Grid-level indexes: - // Linear index of block in grid. - const int bid_in_grid = blockIdx.x + blockIdx.y * gridDim.x; - // Linear index of thread in grid. - const int tid_in_grid = bid_in_grid * (blockDim.x * blockDim.y) + tid_in_block; - // Total number of blocks in a 2-D grid. - const int num_blocks_in_grid = gridDim.x * gridDim.y; - // Total number of threads in a 2-D grid with 2-D blocks. - const int num_threads_in_grid = num_blocks_in_grid * num_threads_in_block; + // Grid-level indices: + // Linear index of block in grid row. + const int bid_in_grid_row = blockIdx.x; + // Linear index of thread in grid row. + const int tid_in_grid_row = bid_in_grid_row * (blockDim.x * blockDim.y) + tid_in_block; + // Total number of blocks in a grid row. + const int num_blocks_in_grid_row = gridDim.x; + // Total number of threads in a grid row with 2-D blocks. + const int num_threads_in_grid_row = num_blocks_in_grid_row * num_threads_in_block; // Thread-level reduction (storage change: global memory -> register). - // One thread reduces NUM_ELEMENTS_PER_THREAD elements to a thread register + // One thread reduces MAX_NUM_ELEMENTS_PER_THREAD elements to a thread register // in one iteration. - TOut value = 0; - for (int id = tid_in_grid; id < size; id += NUM_ELEMENTS_PER_THREAD * num_threads_in_grid) { - TOut v[NUM_ELEMENTS_PER_THREAD]; + TBuf value = 0; + for (int id = tid_in_grid_row; id < num_elements; id += MAX_NUM_ELEMENTS_PER_THREAD * num_threads_in_grid_row) { + TBuf v[MAX_NUM_ELEMENTS_PER_THREAD]; - #pragma unroll - for (int i = 0; i < NUM_ELEMENTS_PER_THREAD; i++) { - int offset = id + i * num_threads_in_grid; - if (offset < size) { - v[i] = TOut(TOp()(data[offset])); +#pragma unroll + for (int i = 0; i < MAX_NUM_ELEMENTS_PER_THREAD; i++) { + const int offset = id + i * num_threads_in_grid_row; + if (offset < num_elements) { + v[i] = TOp()(TBuf(input[offset])); } else { - v[i] = TOut(0.0f); + v[i] = TBuf(0); } } - #pragma unroll - for (int i = 0; i < NUM_ELEMENTS_PER_THREAD; i++) { +#pragma unroll + for (int i = 0; i < MAX_NUM_ELEMENTS_PER_THREAD; i++) { value += v[i]; } } @@ -95,31 +167,30 @@ __global__ void reduce_all_kernel(const int size, const TIn * data, TOut* output // Warp-level reduction (storage change: register -> register). // The values in a warp will be summed up to a scalar. After warp-level // reduction, each block holds num_warps_in_block values in the shared memory. - TOut value_ = value; #pragma unroll for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) { - value_ += WARP_SHFL_DOWN(value_, stride); + value += WARP_SHFL_DOWN(value, stride); } // Return early if only one warp is used for reduction. - // Given a fixed amount of threads, we perfer threads over warps over blocks so that we never have cases such as + // Given a fixed amount of threads, we prefer threads over warps over blocks so that we never have cases such as // 1. two blocks and each of them has only 1 warp (32 threads). // 2. two warps and each of them has only 2 threads. if (num_warps_in_block == 1) { - if (tid_in_grid == 0) { + if (tid_in_grid_row == 0) { // Compilation time if-else branch controlled by template argument can be // optimized out, so there will be no branch in real computation phase. if (DivideResultBySize) { - output[0] = TFinalOp()(value_ / TOut(size)); + output[0] = TFinalOp()(TOut(value) / TOut(num_elements)); } else { - output[0] = TFinalOp()(value_); + output[0] = TFinalOp()(TOut(value)); } } return; } if (lid_in_block == 0) { - shared_memory[wid_in_block] = value_; + shared_memory[wid_in_block] = value; } __syncthreads(); @@ -129,10 +200,10 @@ __global__ void reduce_all_kernel(const int size, const TIn * data, TOut* output // Note that the values are stored in the shared memory. // Here we assume that the size of shared_memory is smaller // than num_warps_in_block, so we just keep halving the number - // of threads in each iteartion. Our assumption is always true because + // of threads in each iteration. Our assumption is always true because // the size of shared_memory equals to the number of warps. #pragma unroll - for (int stride = NUM_WARPS_PER_BLOCK / 2; stride > 0; stride /= 2) { + for (int stride = MAX_NUM_WARPS_PER_BLOCK / 2; stride > 0; stride /= 2) { if (tid_in_block + stride < num_warps_in_block) { shared_memory[tid_in_block] += shared_memory[tid_in_block + stride]; } @@ -140,47 +211,46 @@ __global__ void reduce_all_kernel(const int size, const TIn * data, TOut* output } // Return early if only one block is used for reduction. - if (num_blocks_in_grid == 1) { - if (tid_in_grid == 0) { + if (num_blocks_in_grid_row == 1) { + if (tid_in_grid_row == 0) { // Compilation time if-else branch controlled by template argument can be // optimized out, so there will be no branch in real computation phase. if (DivideResultBySize) { - output[0] = TFinalOp()(shared_memory[0] / TOut(size)); + output[0] = TFinalOp()(TOut(shared_memory[0]) / TOut(num_elements)); } else { - output[0] = TFinalOp()(shared_memory[0]); + output[0] = TFinalOp()(TOut(shared_memory[0])); } } return; } if (tid_in_block == 0) { - buffer[bid_in_grid] = shared_memory[0]; + block_reductions_buffer[bid_in_grid_row] = shared_memory[0]; } __threadfence(); __syncthreads(); - // Grid-level reduciton. We use the last block to sum up values - // stored in the global buffer. + // Grid-level reduction. We use the last block to sum up values + // stored in the global block_reductions_buffer. __shared__ bool is_last_block_done; if (tid_in_block == 0) { - int* p_lock = reinterpret_cast(buffer + num_blocks_in_grid); - int count = atomicAdd(p_lock, 1); - is_last_block_done = (count == (num_blocks_in_grid - 1)); + const int count = atomicAdd(block_done_count_buffer, 1); + is_last_block_done = (count == (num_blocks_in_grid_row - 1)); } // All threads in each block see if they belong the last active block // (i.e., the value of is_last_block_done). __syncthreads(); - // Only the block which saw that count equals to num_blocks_in_grid - 1 can + // Only the block which saw that count equals to num_blocks_in_grid_row - 1 can // enter the following block. if (is_last_block_done) { - const int pow2_bound = least_pow2_bound(num_blocks_in_grid); + const int pow2_bound = least_pow2_bound(num_blocks_in_grid_row); for (int stride = pow2_bound / 2; stride > 0; stride /= 2) { - if (tid_in_block < stride && tid_in_block + stride < num_blocks_in_grid) { - buffer[tid_in_block] += buffer[tid_in_block + stride]; + if (tid_in_block < stride && tid_in_block + stride < num_blocks_in_grid_row) { + block_reductions_buffer[tid_in_block] += block_reductions_buffer[tid_in_block + stride]; } __syncthreads(); } @@ -190,107 +260,124 @@ __global__ void reduce_all_kernel(const int size, const TIn * data, TOut* output // Compilation time if-else branch controlled by template argument can be // optimized out, so there will be no branch in real computation phase. if (DivideResultBySize) { - output[0] = TFinalOp()(buffer[0] / TOut(size)); + output[0] = TFinalOp()(TOut(block_reductions_buffer[0]) / TOut(num_elements)); } else { - output[0] = TFinalOp()(buffer[0]); + output[0] = TFinalOp()(TOut(block_reductions_buffer[0])); } } } } -template -void call_reduce_all_kernel(const TIn *data, TOut *output, int size, TOut *buffer) { - const auto block_size = compute_block_size(size); - const int num_blocks = compute_grid_size(size); - const dim3 block(block_size.first, block_size.second, 1); - const dim3 grid(num_blocks, 1, 1); +template +__global__ void reduce_matrix_columns_kernel( + const int num_rows, const int num_cols, const TIn* const input, TOut* const output, + TBuf* const block_reductions_buffer, int* const block_done_counts_buffer) { + const int num_blocks_in_grid_row = gridDim.x; + const int row_id_in_grid = blockIdx.y; + const int num_grid_rows = gridDim.y; - // If more than one blocks are used, then inter-blocks reduction is needed. - if (num_blocks != 1) { - CUDA_CALL_THROW(cudaMemsetAsync(buffer + num_blocks, 0, sizeof(int))); + // one row per iteration + // row_id is int64_t to avoid int overflow in offset calculations + for (int64_t row_id = row_id_in_grid; row_id < num_rows; row_id += num_grid_rows) { + const TIn* const row_data = input + row_id * num_cols; + TOut* const row_output = output + row_id; + TBuf* const row_block_reductions_buffer = block_reductions_buffer + row_id * num_blocks_in_grid_row; + int* const row_block_done_counts_buffer = block_done_counts_buffer + row_id; + + reduce_all( + num_cols, row_data, row_output, + row_block_reductions_buffer, row_block_done_counts_buffer); + } +} + +template +Status call_reduce_matrix_columns( + const TIn* input, TOut* output, const int num_rows, const int num_cols, void* buffer, size_t buffer_size) { + ORT_ENFORCE(num_rows >= 0 && num_cols >= 0); + + using TBuf = AccumulationType_t; + + const auto grid_and_block_dims = compute_grid_and_block_dims(num_rows, num_cols); + const dim3& grid_dim = grid_and_block_dims.first; + const dim3& block_dim = grid_and_block_dims.second; + + TBuf* block_reductions_buffer; + int* block_done_counts_buffer; + ORT_RETURN_IF_ERROR(get_reduction_buffers( + num_rows, num_cols, buffer, buffer_size, + block_reductions_buffer, block_done_counts_buffer)); + + // If more than one block is used per grid row, then inter-block reduction is needed. + if (grid_dim.x > 1) { + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(block_done_counts_buffer, 0, num_rows * sizeof(int))); } - const int shared_mem_size = sizeof(TOut) * block_size.first * block_size.second / GPU_WARP_SIZE; - reduce_all_kernel<<>>(size, data, output, buffer); + const int shared_mem_size = sizeof(TBuf) * block_dim.x * block_dim.y / GPU_WARP_SIZE; + reduce_matrix_columns_kernel + <<>>( + num_rows, num_cols, input, output, block_reductions_buffer, block_done_counts_buffer); + + return Status::OK(); +} +} // namespace detail + +template +Status reduce_sum( + const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) { + return detail::call_reduce_matrix_columns( + input, output, 1, size, buffer, buffer_size); } template -void reduce_sum(const TIn* data, TOut* output, int size, TOut* buffer) { - call_reduce_all_kernel, Identity, false>( - data, output, size, buffer); +Status reduce_square_sum( + const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) { + return detail::call_reduce_matrix_columns( + input, output, 1, size, buffer, buffer_size); } template -void reduce_square_sum(const TIn* data, TOut* output, int size, TOut* buffer) { - call_reduce_all_kernel, Identity, false>( - data, output, size, buffer); +Status reduce_l2_norm( + const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) { + return detail::call_reduce_matrix_columns( + input, output, 1, size, buffer, buffer_size); } template -void reduce_l2_norm(const TIn* data, TOut* output, int size, TOut* buffer) { - call_reduce_all_kernel, Sqrt, false>( - data, output, size, buffer); +Status reduce_mean( + const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) { + return detail::call_reduce_matrix_columns( + input, output, 1, size, buffer, buffer_size); } -template -void reduce_mean(const TIn* data, TOut* output, int size, TOut* buffer) { - call_reduce_all_kernel, Identity, true>( - data, output, size, buffer); -} +#define INSTANTIATE_REDUCE_SUM(TIn, TOut) \ + template Status reduce_sum(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) +INSTANTIATE_REDUCE_SUM(half, float); +INSTANTIATE_REDUCE_SUM(float, float); +INSTANTIATE_REDUCE_SUM(double, double); +#undef INSTANTIATE_REDUCE_SUM -template void reduce_sum( - const half* data, float* output, int size, float* buffer); -template void reduce_sum( - const float* data, float* output, int size, float* buffer); -template void reduce_sum( - const double* data, double* output, int size, double* buffer); +#define INSTANTIATE_REDUCE_SQUARE_SUM(TIn, TOut) \ + template Status reduce_square_sum(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) +INSTANTIATE_REDUCE_SQUARE_SUM(half, float); +INSTANTIATE_REDUCE_SQUARE_SUM(float, float); +INSTANTIATE_REDUCE_SQUARE_SUM(double, double); +#undef INSTANTIATE_REDUCE_SQUARE_SUM -template void reduce_square_sum( - const half* data, float* output, int size, float* buffer); -template void reduce_square_sum( - const float* data, float* output, int size, float* buffer); -template void reduce_square_sum( - const double* data, double* output, int size, double* buffer); +#define INSTANTIATE_REDUCE_L2_NORM(TIn, TOut) \ + template Status reduce_l2_norm(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) +INSTANTIATE_REDUCE_L2_NORM(half, float); +INSTANTIATE_REDUCE_L2_NORM(float, float); +INSTANTIATE_REDUCE_L2_NORM(double, double); +#undef INSTANTIATE_REDUCE_L2_NORM -template void reduce_l2_norm( - const half* data, float* output, int size, float* buffer); -template void reduce_l2_norm( - const float* data, float* output, int size, float* buffer); -template void reduce_l2_norm( - const double* data, double* output, int size, double* buffer); - -template void reduce_mean( - const half* data, float* output, int size, float* buffer); -template void reduce_mean( - const float* data, float* output, int size, float* buffer); -template void reduce_mean( - const double* data, double* output, int size, double* buffer); - -bool is_matrix_row_reduction( - const cudnnReduceTensorOp_t cudnn_reduce_op, - const int m, - const int n, - const size_t rank, - std::vector axes) { - if (m < 1) - return false; - - if (n < 1) - return false; - - if (rank < 2) - return false; - - if (cudnn_reduce_op != CUDNN_REDUCE_TENSOR_ADD) - return false; - - //empty axes, default reduction - if (axes.size() < 1) - return false; - - return true; -} +#define INSTANTIATE_REDUCE_MEAN(TIn, TOut) \ + template Status reduce_mean(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) +INSTANTIATE_REDUCE_MEAN(half, float); +INSTANTIATE_REDUCE_MEAN(float, float); +INSTANTIATE_REDUCE_MEAN(double, double); +#undef INSTANTIATE_REDUCE_MEAN +namespace detail { template __global__ void reduce_matrix_rows_kernel(const TIn* input, TOut* output, int m, int n) { constexpr int x_load_count_per_thread = 1; @@ -304,8 +391,8 @@ __global__ void reduce_matrix_rows_kernel(const TIn* input, TOut* output, int m, const int tid_in_block = threadIdx.x + blockDim.x * threadIdx.y; // Shape is blockDim.y-by-blockDim.x and element type is TBuf. - extern __shared__ unsigned char shared_memory_[]; - TBuf* shared_memory = reinterpret_cast(shared_memory_); + extern __shared__ unsigned char shared_memory_bytes[]; + TBuf* shared_memory = reinterpret_cast(shared_memory_bytes); // to prevent int overflow in index calculation for input size m*n const int64_t n_int64 = static_cast(n); @@ -348,11 +435,14 @@ __global__ void reduce_matrix_rows_kernel(const TIn* input, TOut* output, int m, } } -// This function reduces the given input tensor along all but the last axis. -// For example, [N, C, H, W]-tensor may lead to a output [W]-tensor. -// It's implementation is in reduction_ops.cu and called in reduction_ops.cc. template -void call_reduce_matrix_rows(const TIn* input, TOut* output, int m, int n) { +Status call_reduce_matrix_rows(const TIn* input, TOut* output, int m, int n, bool reset_initial_output) { + ORT_ENFORCE(m >= 0 && n >= 0); + + if (reset_initial_output) { + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(output, 0, n * sizeof(TOut))); + } + constexpr int max_num_threads_in_block = 512; constexpr int max_num_blocks_in_grid = 512; constexpr int load_count_per_thread = 4; @@ -367,22 +457,36 @@ void call_reduce_matrix_rows(const TIn* input, TOut* output, int m, int n) { reduce_matrix_rows_kernel<<>>( input, output, m, n); + + return Status::OK(); } +} // namespace detail template -void reduce_matrix_rows(const TIn* data, TOut* output, int m, int n) { - call_reduce_matrix_rows(data, output, m, n); +Status reduce_matrix_rows(const TIn* input, TOut* output, int m, int n, bool reset_initial_output) { + using TBuf = AccumulationType_t; + return detail::call_reduce_matrix_rows(input, output, m, n, reset_initial_output); } -template <> -void reduce_matrix_rows(const half* data, half* output, int m, int n) { - call_reduce_matrix_rows(data, output, m, n); +#define INSTANTIATE_REDUCE_MATRIX_ROWS(T) \ + template Status reduce_matrix_rows(const T* input, T* output, int m, int n, bool reset_initial_output) +INSTANTIATE_REDUCE_MATRIX_ROWS(half); +INSTANTIATE_REDUCE_MATRIX_ROWS(float); +INSTANTIATE_REDUCE_MATRIX_ROWS(double); +#undef INSTANTIATE_REDUCE_MATRIX_ROWS + +template +Status reduce_matrix_columns(const TIn* input, TOut* output, int m, int n, void* buffer, size_t buffer_size) { + return detail::call_reduce_matrix_columns( + input, output, m, n, buffer, buffer_size); } -template void reduce_matrix_rows( - const float* data, float* output, int m, int n); -template void reduce_matrix_rows( - const double* data, double* output, int m, int n); +#define INSTANTIATE_REDUCE_MATRIX_COLUMNS(T) \ + template Status reduce_matrix_columns(const T* input, T* output, int m, int n, void* buffer, size_t buffer_size) +INSTANTIATE_REDUCE_MATRIX_COLUMNS(half); +INSTANTIATE_REDUCE_MATRIX_COLUMNS(float); +INSTANTIATE_REDUCE_MATRIX_COLUMNS(double); +#undef INSTANTIATE_REDUCE_MATRIX_COLUMNS } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_functions.h b/onnxruntime/core/providers/cuda/reduction/reduction_functions.h index c16197664c..8d5da0381e 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_functions.h +++ b/onnxruntime/core/providers/cuda/reduction/reduction_functions.h @@ -2,35 +2,106 @@ // Licensed under the MIT License. #pragma once + #include "core/providers/cuda/cuda_common.h" +#include "core/providers/cuda/shared_inc/accumulation_type.h" namespace onnxruntime { namespace cuda { -int compute_reduction_buffer_size(int element_size, int size); +namespace detail { +size_t compute_reduce_matrix_columns_intermediate_buffer_size( + int element_size, int num_rows, int num_cols); +} +/** + * Computes the size in bytes of the intermediate buffer needed by reduce_matrix_columns(). + * @tparam TIn The input data type. + * @param m The number of matrix rows. + * @param n The number of matrix columns. + * @return The size of the intermediate buffer. + */ +template +size_t compute_reduce_matrix_columns_buffer_size(int m, int n) { + using TBuf = AccumulationType_t; + return detail::compute_reduce_matrix_columns_intermediate_buffer_size( + sizeof(TBuf), m, n); +} + +/** + * Computes the size in bytes of the intermediate buffer needed by the reduce_x() functions. + * @tparam TIn The input data type. + * @param size The number of elements. + * @return The size of the intermediate buffer. + */ +template +size_t compute_reduction_buffer_size(int size) { + using TBuf = AccumulationType_t; + return detail::compute_reduce_matrix_columns_intermediate_buffer_size( + sizeof(TBuf), 1, size); +} + +/** Computes the sum of the given elements. */ template -void reduce_sum(const TIn* input, TOut* output, int size, TOut* buffer); +Status reduce_sum(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size); +/** Computes the sum of the squares of the given elements. */ template -void reduce_square_sum(const TIn* input, TOut* output, int size, TOut* buffer); +Status reduce_square_sum(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size); +/** Computes the L2 norm of the given elements. */ template -void reduce_l2_norm(const TIn* input, TOut* output, int size, TOut* buffer); +Status reduce_l2_norm(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size); +/** Computes the mean of the given elements. */ template -void reduce_mean(const TIn* data, TOut* output, int size, TOut* buffer); +Status reduce_mean(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size); -// Determine if a CUDNN reduction can be computed by reduce_matrix_rows. -bool is_matrix_row_reduction( +enum class ApplicableMatrixReduction { + // can use reduce_matrix_rows() + Rows, + // can use reduce_matrix_columns() + Columns, + // no optimized matrix reduction function applies + None, +}; + +/** + * Determines whether a cuDNN reduction can be computed by an optimized matrix reduction function. + * @param cudnn_reduce_op The cuDNN reduction op type. + * @param dims The input dimensions. + * @param axes The reduction axes. + * @param[out] m If matrix reduction is possible, the number of matrix rows to use. + * @param[out] n If matrix reduction is possible, the number of matrix columns to use. + * @return The type of matrix reduction that can be done. + */ +ApplicableMatrixReduction get_applicable_matrix_reduction( const cudnnReduceTensorOp_t cudnn_reduce_op, - const int m, - const int n, - const size_t rank, - std::vector axes); + const std::vector& dims, const std::vector& axes, + int& m, int& n); +/** + * Reduces the rows in a row-major matrix to a single row containing the sum of each column. + * @param input The input data. + * @param output The output data. + * @param m The number of matrix rows. + * @param n The number of matrix columns. + * @param reset_initial_output Whether to reset (i.e., zero) the output values first. + */ template -void reduce_matrix_rows(const TIn* data, TOut* output, int m, int n); +Status reduce_matrix_rows(const TIn* input, TOut* output, int m, int n, bool reset_initial_output = true); + +/** + * Reduces the columns in a row-major matrix to a single column containing the sum of each row. + * @param input The input data. + * @param output The output data. + * @param m The number of matrix rows. + * @param n The number of matrix columns. + * @param buffer The intermediate buffer. + * @param buffer_size The size of the intermediate buffer in bytes. + */ +template +Status reduce_matrix_columns(const TIn* input, TOut* output, int m, int n, void* buffer, size_t buffer_size); } // namespace cuda -} // namespace onnxruntime \ No newline at end of file +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc index 47fc6026a4..bbcad7d666 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc @@ -1,14 +1,16 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "reduction_ops.h" +#include "core/providers/cuda/reduction/reduction_ops.h" + +#include "core/framework/data_types_internal.h" +#include "core/framework/op_kernel_context_internal.h" #include "core/providers/common.h" +#include "core/providers/cpu/tensor/utils.h" #include "core/providers/cuda/cudnn_common.h" -#include "core/providers/cuda/math/unary_elementwise_ops_impl.h" #include "core/providers/cuda/math/binary_elementwise_ops_impl.h" #include "core/providers/cuda/math/binary_elementwise_ops.h" -#include "core/providers/cpu/tensor/utils.h" -#include "core/framework/op_kernel_context_internal.h" +#include "core/providers/cuda/math/unary_elementwise_ops_impl.h" using namespace onnxruntime::common; namespace onnxruntime { @@ -27,7 +29,7 @@ namespace cuda { ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ name, \ kOnnxDomain, \ - 11, 12, \ + 11, 12, \ T, \ kCudaExecutionProvider, \ KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), \ @@ -135,22 +137,27 @@ Status ReduceKernel::ReduceKernelShared( cudnnReduceTensorOp_t cudnn_reduce_op, std::vector& output_dims) const { typedef typename ToCudaType::MappedType CudaT; + typedef typename ToCudaType::MappedType CudaOutT; cudnnDataType_t cudnn_type_X = CudnnTensor::GetDataType(); const auto rank = input_shape.NumDimensions(); - // Block of fast matrix row reduction. - const auto stride = input_shape[input_shape.NumDimensions() - 1]; - const auto reduction_size = input_shape.Size() / stride; - if (fast_reduction_ && reduction_size <= std::numeric_limits::max() && stride <= std::numeric_limits::max() && - is_matrix_row_reduction(cudnn_reduce_op, - static_cast(reduction_size), - static_cast(stride), rank, axes_)) { - reduce_matrix_rows( - reinterpret_cast(X), - reinterpret_cast(Y), - static_cast(reduction_size), - static_cast(stride)); - return Status::OK(); + // Block of fast matrix reduction. + if (fast_reduction_) { + int m{}, n{}; + const auto applicable_matrix_reduction = get_applicable_matrix_reduction( + cudnn_reduce_op, input_shape.GetDims(), axes_, m, n); + switch (applicable_matrix_reduction) { + case ApplicableMatrixReduction::Rows: { + return reduce_matrix_rows( + reinterpret_cast(X), + reinterpret_cast(Y), + m, n, false); + } + case ApplicableMatrixReduction::Columns: + // don't call reduce_matrix_columns() since it will reset initial output data + default: + break; + } } const auto& input_dims = input_shape.GetDims(); @@ -335,11 +342,8 @@ Status PrepareForReduce(const Tensor* X, ORT_ENFORCE(nullptr != X); const TensorShape& input_shape = input_shape_override ? *input_shape_override : X->Shape(); - int64_t rank = static_cast(input_shape.NumDimensions()); - prepare_reduce_metadata.rank = rank; + const int64_t rank = gsl::narrow(input_shape.NumDimensions()); prepare_reduce_metadata.input_count = input_shape.Size(); - prepare_reduce_metadata.stride = (rank > 0) ? input_shape[input_shape.NumDimensions() - 1] : 1; - prepare_reduce_metadata.contiguous_axes = false; if (rank > 8) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "cuDNN only supports up to 8-D tensors in reduction"); @@ -349,36 +353,15 @@ Status PrepareForReduce(const Tensor* X, std::vector reduced(rank, false); prepare_reduce_metadata.output_dims.reserve(input_dims.size()); if (axes.size() > 0) { - int64_t reduced_axis; - std::vector reduced_axes(axes.size()); prepare_reduce_metadata.output_dims = input_dims; - for (size_t i = 0; i < axes.size(); i++) { - reduced_axis = axes[i]; - const int64_t axis = HandleNegativeAxis(reduced_axis, rank); + for (auto axis : axes) { + axis = HandleNegativeAxis(axis, rank); ORT_ENFORCE(input_dims[axis] != 0, "Can't reduce on dim with value of 0 if 'keepdims' is false. " "Invalid output shape would be produced. input_shape:", input_shape); prepare_reduce_metadata.output_dims[axis] = 1; reduced[axis] = true; - reduced_axes[i] = axis; - } - - bool contiguous_axes = true; - std::sort(reduced_axes.begin(), reduced_axes.end()); - for (size_t i = 0; i < reduced_axes.size(); i++) { - if (reduced_axes[i] != i) { - contiguous_axes = false; - break; - } - } - int64_t stride = 1; - if (contiguous_axes) { - for (size_t s = rank - 1; s >= reduced_axes.size(); s--) { - stride *= input_dims[s]; - } - prepare_reduce_metadata.stride = stride; - prepare_reduce_metadata.contiguous_axes = true; } } else { // no axes provided (i.e.) default axes => reduce on all dims @@ -416,10 +399,6 @@ Status PrepareForReduce(const Tensor* X, prepare_reduce_metadata.output_count = TensorShape(prepare_reduce_metadata.output_dims).Size(); - if (prepare_reduce_metadata.rank == 0) { - prepare_reduce_metadata.rank = 1; - } - return Status::OK(); } @@ -438,8 +417,6 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr std::vector& output_dims = prepare_reduce_metadata.output_dims; std::vector& input_dims_cudnn = prepare_reduce_metadata.input_dims_cudnn; std::vector& output_dims_cudnn = prepare_reduce_metadata.output_dims_cudnn; - int64_t rank = prepare_reduce_metadata.rank; - int64_t stride = prepare_reduce_metadata.stride; // special case when there is a dim value of 0 in the shape. if (input_count == 0) { @@ -447,6 +424,31 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr return Status::OK(); } + // Block of fast matrix reduction. + if (fast_reduction) { + int m{}, n{}; + const auto applicable_matrix_reduction = get_applicable_matrix_reduction( + cudnn_reduce_op, input_shape.GetDims(), axes, m, n); + switch (applicable_matrix_reduction) { + case ApplicableMatrixReduction::Rows: { + return reduce_matrix_rows( + reinterpret_cast(input.template Data()), + reinterpret_cast(output.template MutableData()), + m, n); + } + case ApplicableMatrixReduction::Columns: { + const auto buffer_size_bytes = compute_reduce_matrix_columns_buffer_size(m, n); + auto buffer = cuda_ep.GetScratchBuffer(buffer_size_bytes); + return reduce_matrix_columns( + reinterpret_cast(input.template Data()), + reinterpret_cast(output.template MutableData()), + m, n, buffer.get(), buffer_size_bytes); + } + default: + break; + } + } + // This reduction keep adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000. // Therefore zeroing out the memory is required CUDA_RETURN_IF_ERROR(cudaMemsetAsync(output.MutableDataRaw(), 0, output.SizeInBytes())); @@ -454,22 +456,6 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr IAllocatorUniquePtr temp_X; cudnnDataType_t cudnn_type_X = CudnnTensor::GetDataType(); - // Block of fast matrix row reduction. - // It relies on new atomicAdd for half type, so old CUDA can't use it. - const auto reduction_size = input_count / stride; - if (!std::is_same::value && !std::is_same::value) { - if (fast_reduction && reduction_size <= std::numeric_limits::max() && stride <= std::numeric_limits::max() && - prepare_reduce_metadata.contiguous_axes && - is_matrix_row_reduction(cudnn_reduce_op, static_cast(reduction_size), static_cast(stride), rank, axes)) { - reduce_matrix_rows( - reinterpret_cast(input.template Data()), - reinterpret_cast(output.template MutableData()), - static_cast(reduction_size), - static_cast(stride)); - return Status::OK(); - } - } - if (ReduceTensorIndices == CUDNN_REDUCE_TENSOR_FLATTENED_INDICES && std::is_same::value) { // ArgMax/ArgMin with FP16 are not supported by cudnn, so convert input to fp32 then call cudnn temp_X = cuda_ep.GetScratchBuffer(input_count); @@ -677,7 +663,7 @@ Status ReduceKernel::ComputeImpl(OpKernelContext* ctx, cudnnRe Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims); bool fast_reduction = fast_reduction_; if (fast_reduction) { - auto ctx_internal = static_cast(ctx); + auto ctx_internal = dynamic_cast(ctx); if (ctx_internal && ctx_internal->GetUseDeterministicCompute()) fast_reduction = false; } diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_ops.h b/onnxruntime/core/providers/cuda/reduction/reduction_ops.h index 8eb404b74a..5cdfdf8dff 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_ops.h +++ b/onnxruntime/core/providers/cuda/reduction/reduction_ops.h @@ -33,9 +33,6 @@ struct PrepareReduceMetadata { std::vector squeezed_output_dims; std::vector input_dims_cudnn; std::vector output_dims_cudnn; - int64_t rank; - int64_t stride; - bool contiguous_axes; }; template diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_utils.cuh b/onnxruntime/core/providers/cuda/reduction/reduction_utils.cuh index e30d6825f2..810d931dd8 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_utils.cuh +++ b/onnxruntime/core/providers/cuda/reduction/reduction_utils.cuh @@ -18,51 +18,26 @@ __forceinline__ __host__ __device__ int least_pow2_bound(int value) { return static_cast(++value_); } -template -struct Cast { - __forceinline__ __device__ TAccumulated operator()(const TValue& value) { - return TAccumulated(value); - } -}; - -template struct Square { - __forceinline__ __device__ TAccumulated operator()(const TValue& value) { - return TAccumulated(value) * TAccumulated(value); + template + __forceinline__ __device__ T operator()(const T& value) { + return value * value; } }; -template -struct Abs { - __forceinline__ __device__ TAccumulated operator()(const TValue& value) { - TAccumulated value_ = TAccumulated(value); - return value_ > TAccumulated(0) ? value_ : -value_; - } -}; - -template struct Sqrt { + template __forceinline__ __device__ T operator()(const T& value) { return _Sqrt(value); } }; -template struct Identity { + template __forceinline__ __device__ T operator()(const T& value) { return value; } }; -template -struct ToBuffer { - typedef T Type; -}; - -template <> -struct ToBuffer { - typedef float Type; -}; - } // namespace cuda -} // namespace onnxruntime \ No newline at end of file +} // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc index da337c0db1..fd4f82e036 100644 --- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc +++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc @@ -8,9 +8,6 @@ #include "test/common/tensor_op_test_utils.h" #include "test/providers/provider_test_utils.h" #include "test/providers/cpu/reduction/reduction_test_cases.h" -#ifdef USE_CUDA -#include "core/providers/cuda/reduction/reduction_functions.h" -#endif namespace onnxruntime { namespace test { @@ -1944,80 +1941,6 @@ TEST(ReductionOpTest, ArgMin_int32_select_last) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kNGraphExecutionProvider}); } -#ifdef USE_CUDA - -void test_reduce_apis(int64_t size, float relative_error_tolerance = 1e-4f) { - float output_sum = 0; - float output_square_sum = 0; - float output_mean = 0; - float expected_output_sum = 0; - float expected_output_square_sum = 0; - float expected_output_mean = 0; - const std::vector shape = {size}; - RandomValueGenerator random_value_generator{}; - const auto input = random_value_generator.Uniform(shape, 0.1f, 1.0f); - for (const auto input_value : input) { - expected_output_sum += input_value; - expected_output_square_sum += input_value * input_value; - expected_output_mean += input_value / float(size); - } - const int buffer_size_in_byte = onnxruntime::cuda::compute_reduction_buffer_size( - static_cast(sizeof(float)), static_cast(size)); - - float* device_input = NULL; - float* device_output_sum = NULL; - float* device_output_square_sum = NULL; - float* device_output_mean = NULL; - float* buffer = NULL; - - cudaMalloc((void**)&device_input, size * sizeof(float)); - cudaMalloc((void**)&device_output_sum, 1 * sizeof(float)); - cudaMalloc((void**)&device_output_square_sum, 1 * sizeof(float)); - cudaMalloc((void**)&device_output_mean, 1 * sizeof(float)); - cudaMalloc((void**)&buffer, buffer_size_in_byte); - - cudaMemcpy(device_input, input.data(), size * sizeof(float), cudaMemcpyHostToDevice); - - onnxruntime::cuda::reduce_sum(device_input, - device_output_sum, - static_cast(size), - buffer); - onnxruntime::cuda::reduce_square_sum(device_input, - device_output_square_sum, - static_cast(size), buffer); - onnxruntime::cuda::reduce_mean( - device_input, - device_output_mean, - static_cast(size), - buffer); - - cudaMemcpy(&output_sum, device_output_sum, 1 * sizeof(float), cudaMemcpyDeviceToHost); - cudaMemcpy(&output_square_sum, device_output_square_sum, 1 * sizeof(float), cudaMemcpyDeviceToHost); - cudaMemcpy(&output_mean, device_output_mean, 1 * sizeof(float), cudaMemcpyDeviceToHost); - - cudaFree(device_input); - cudaFree(buffer); - cudaFree(device_output_sum); - cudaFree(device_output_square_sum); - cudaFree(device_output_mean); - - EXPECT_LT(std::abs(output_sum - expected_output_sum) / expected_output_sum, relative_error_tolerance); - EXPECT_LT(std::abs(output_square_sum - expected_output_square_sum) / expected_output_square_sum, - relative_error_tolerance); - EXPECT_LT(std::abs(output_mean - expected_output_mean) / expected_output_mean, relative_error_tolerance); -} - -TEST(ReduceApiTest, Sum) { - test_reduce_apis(3); - test_reduce_apis(19); - test_reduce_apis(123); - test_reduce_apis(1128); - test_reduce_apis(5566); - test_reduce_apis(941736, 2e-4f); -} - -#endif - TEST(ReductionOpTest, ArgMin_int32_neg_axis) { OpTester test("ArgMin"); test.AddAttribute("axis", (int64_t)(-3)); diff --git a/onnxruntime/test/providers/cuda/reduction_functions_test.cc b/onnxruntime/test/providers/cuda/reduction_functions_test.cc new file mode 100644 index 0000000000..f504cf1161 --- /dev/null +++ b/onnxruntime/test/providers/cuda/reduction_functions_test.cc @@ -0,0 +1,308 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#ifdef USE_CUDA + +#include + +#include "gtest/gtest.h" + +#include "core/providers/cuda/reduction/reduction_functions.h" +#include "test/common/tensor_op_test_utils.h" +#include "test/util/include/asserts.h" + +namespace onnxruntime { +namespace test { + +namespace { +struct DeviceMemoryDeleter { + template + void operator()(T* p) { + cudaFree(p); + } +}; + +template +std::unique_ptr AllocateDeviceMemory(size_t n = 1) { + T* p{}; + cudaMalloc(&p, n * sizeof(T)); + return std::unique_ptr(p); +} + +template +void CheckDeviceValues(size_t n, const T* d_actual, const T* expected, float relative_error_tolerance) { + std::vector actual(n); + cudaMemcpy(actual.data(), d_actual, n * sizeof(T), cudaMemcpyDeviceToHost); + + for (size_t i = 0; i < n; ++i) { + EXPECT_LE(std::abs(actual[i] - expected[i]) / expected[i], relative_error_tolerance) + << "i: " << i << ", actual[i]: " << actual[i] << ", expected[i]: " << expected[i]; + } +} + +void TestReduceRowToScalarApis(int size, float relative_error_tolerance = 1e-4f) { + SCOPED_TRACE(MakeString("size: ", size)); + + float expected_output_sum = 0; + float expected_output_square_sum = 0; + float expected_output_mean = 0; + const std::vector shape = {size}; + RandomValueGenerator random_value_generator{}; + const auto input = random_value_generator.Uniform(shape, 0.1f, 1.0f); + for (const auto input_value : input) { + expected_output_sum += input_value; + expected_output_square_sum += input_value * input_value; + expected_output_mean += input_value / float(size); + } + const auto buffer_size_in_bytes = + cuda::compute_reduction_buffer_size(size); + + auto device_input = AllocateDeviceMemory(size); + auto device_output_sum = AllocateDeviceMemory(); + auto device_output_square_sum = AllocateDeviceMemory(); + auto device_output_mean = AllocateDeviceMemory(); + auto buffer = AllocateDeviceMemory(buffer_size_in_bytes); + + cudaMemcpy(device_input.get(), input.data(), size * sizeof(float), cudaMemcpyHostToDevice); + + ASSERT_STATUS_OK(cuda::reduce_sum( + device_input.get(), + device_output_sum.get(), + size, + buffer.get(), + buffer_size_in_bytes)); + ASSERT_STATUS_OK(cuda::reduce_square_sum( + device_input.get(), + device_output_square_sum.get(), + size, + buffer.get(), + buffer_size_in_bytes)); + ASSERT_STATUS_OK(cuda::reduce_mean( + device_input.get(), + device_output_mean.get(), + size, + buffer.get(), + buffer_size_in_bytes)); + + ASSERT_TRUE(CUDA_CALL(cudaDeviceSynchronize())); + + CheckDeviceValues(1, device_output_sum.get(), &expected_output_sum, relative_error_tolerance); + CheckDeviceValues(1, device_output_square_sum.get(), &expected_output_square_sum, relative_error_tolerance); + CheckDeviceValues(1, device_output_mean.get(), &expected_output_mean, relative_error_tolerance); +} + +void TestReduceRowsToRow(int m, int n, bool reset_initial_output, float relative_error_tolerance = 1e-4f) { + SCOPED_TRACE(MakeString("m: ", m, ", n:", n, ", reset_initial_output: ", reset_initial_output)); + + const TensorShape shape{m, n}; + RandomValueGenerator random{}; + const auto values = random.Uniform(shape.GetDims(), 1.0f, 10.0f); + const auto initial_value = reset_initial_output ? 0.0f : 5.0f; + const std::vector expected_row = + [m, n, &values, initial_value]() { + std::vector row(n, initial_value); + for (int i = 0; i < m; ++i) { + for (int j = 0; j < n; ++j) { + row[j] += values[i * n + j]; + } + } + return row; + }(); + + auto d_in = AllocateDeviceMemory(m * n); + auto d_out = AllocateDeviceMemory(n); + + cudaMemcpy(d_in.get(), values.data(), m * n * sizeof(float), cudaMemcpyHostToDevice); + + if (!reset_initial_output) { + // manually initialize output data + cuda::Fill(d_out.get(), initial_value, n); + } + + ASSERT_STATUS_OK(cuda::reduce_matrix_rows( + d_in.get(), d_out.get(), + m, n, + reset_initial_output)); + + ASSERT_TRUE(CUDA_CALL(cudaDeviceSynchronize())); + + CheckDeviceValues(n, d_out.get(), expected_row.data(), relative_error_tolerance); +} + +template +std::vector ExpectedReduceMatrixColumnsOutput( + int m, int n, const std::vector& values) { + std::vector column(m); + for (int i = 0; i < m; ++i) { + for (int j = 0; j < n; ++j) { + column[i] += values[i * n + j]; + } + } + return column; +} + +void TestReduceColumnsToColumn(int m, int n, float relative_error_tolerance = 1e-4f) { + SCOPED_TRACE(MakeString("m: ", m, ", n:", n)); + + const TensorShape shape{m, n}; + RandomValueGenerator random{}; + const auto values = random.Uniform(shape.GetDims(), 1.0f, 10.0f); + const auto expected_column = ExpectedReduceMatrixColumnsOutput(m, n, values); + + auto d_in = AllocateDeviceMemory(m * n); + auto d_out = AllocateDeviceMemory(m); + + cudaMemcpy(d_in.get(), values.data(), m * n * sizeof(float), cudaMemcpyHostToDevice); + + size_t buffer_size_in_bytes = + cuda::compute_reduce_matrix_columns_buffer_size(m, n); + auto d_buffer = AllocateDeviceMemory(buffer_size_in_bytes); + + ASSERT_STATUS_OK(cuda::reduce_matrix_columns( + d_in.get(), d_out.get(), + m, n, + d_buffer.get(), buffer_size_in_bytes)); + + ASSERT_TRUE(CUDA_CALL(cudaDeviceSynchronize())); + + CheckDeviceValues(m, d_out.get(), expected_column.data(), relative_error_tolerance); +} +} // namespace + +TEST(ReductionFunctionsTest, ReduceRowToScalar) { + TestReduceRowToScalarApis(3); + TestReduceRowToScalarApis(19); + TestReduceRowToScalarApis(123); + TestReduceRowToScalarApis(1128); + TestReduceRowToScalarApis(5566); + TestReduceRowToScalarApis(941736, 2e-4f); +} + +TEST(ReductionFunctionsTest, ReduceRowsToRow) { + for (int m : {3, 193, 2945}) { + for (int n : {3, 193, 2945}) { + TestReduceRowsToRow(m, n, true); + TestReduceRowsToRow(m, n, false); + } + } +} + +TEST(ReductionFunctionsTest, ReduceColumnsToColumn) { + for (int m : {3, 193, 2945}) { + for (int n : {3, 193, 2945}) { + TestReduceColumnsToColumn(m, n); + } + } +} + +TEST(ReductionFunctionsTest, BufferOffsets) { + const int m = 2048; + const int n = 1024; + + const size_t max_buffer_offset = 15; + + const size_t buffer_size_in_bytes = + cuda::compute_reduce_matrix_columns_buffer_size(m, n) + max_buffer_offset; + + auto d_input = AllocateDeviceMemory(m * n); + auto d_output = AllocateDeviceMemory(m); + auto d_buffer = AllocateDeviceMemory(buffer_size_in_bytes); + + RandomValueGenerator random{}; + const float relative_error_tolerance = 1e-4f; + + for (size_t buffer_offset = 1; buffer_offset <= max_buffer_offset; ++buffer_offset) { + SCOPED_TRACE(MakeString("buffer offset: ", buffer_offset)); + + const auto input = random.Uniform({m, n}, 1.0, 10.0); + cudaMemcpy(d_input.get(), input.data(), m * n * sizeof(double), cudaMemcpyHostToDevice); + + ASSERT_STATUS_OK(cuda::reduce_matrix_columns( + d_input.get(), d_output.get(), + m, n, + d_buffer.get() + buffer_offset, + buffer_size_in_bytes - buffer_offset)); + + const auto expected_column = ExpectedReduceMatrixColumnsOutput(m, n, input); + CheckDeviceValues(m, d_output.get(), expected_column.data(), relative_error_tolerance); + } +} + +TEST(ReductionFunctionsTest, InvalidBufferSize) { + const int m = 2048; + const int n = 1024; + + // this should be too small + const size_t buffer_size_in_bytes = + cuda::compute_reduce_matrix_columns_buffer_size(m, n) / 10; + + auto d_input = AllocateDeviceMemory(m * n); + auto d_output = AllocateDeviceMemory(m); + auto d_buffer = AllocateDeviceMemory(buffer_size_in_bytes); + + RandomValueGenerator random{}; + const auto input = random.Uniform({m, n}, 1.0, 10.0); + cudaMemcpy(d_input.get(), input.data(), m * n * sizeof(float), cudaMemcpyHostToDevice); + + const auto status = + cuda::reduce_matrix_columns(d_input.get(), d_output.get(), m, n, d_buffer.get(), buffer_size_in_bytes); + ASSERT_FALSE(status.IsOK()); +} + +TEST(ReductionFunctionsTest, GetApplicableMatrixReduction) { + const cudnnReduceTensorOp_t valid_op_type = CUDNN_REDUCE_TENSOR_ADD; + int m{}, n{}; + + // contiguous axes from beginning + EXPECT_EQ( + cuda::get_applicable_matrix_reduction( + valid_op_type, {2, 4, 8, 16}, {0, 1}, m, n), + cuda::ApplicableMatrixReduction::Rows); + EXPECT_EQ(m, 2 * 4); + EXPECT_EQ(n, 8 * 16); + + // contiguous axes to end + EXPECT_EQ( + cuda::get_applicable_matrix_reduction( + valid_op_type, {2, 4, 8, 16}, {1, 2, 3}, m, n), + cuda::ApplicableMatrixReduction::Columns); + EXPECT_EQ(m, 2); + EXPECT_EQ(n, 4 * 8 * 16); + + // single axis + EXPECT_EQ( + cuda::get_applicable_matrix_reduction( + valid_op_type, {2, 4, 8, 16}, {3}, m, n), + cuda::ApplicableMatrixReduction::Columns); + EXPECT_EQ(m, 2 * 4 * 8); + EXPECT_EQ(n, 16); + + // unsupported axes + EXPECT_EQ( + cuda::get_applicable_matrix_reduction( + valid_op_type, {2, 4, 8, 16}, {0, 1, 2, 3}, m, n), + cuda::ApplicableMatrixReduction::None); + EXPECT_EQ( + cuda::get_applicable_matrix_reduction( + valid_op_type, {2, 4, 8, 16}, {}, m, n), + cuda::ApplicableMatrixReduction::None); + EXPECT_EQ( + cuda::get_applicable_matrix_reduction( + valid_op_type, {2, 4, 8, 16, 32, 64}, {0, 1, 3, 4}, m, n), + cuda::ApplicableMatrixReduction::None); + EXPECT_EQ( + cuda::get_applicable_matrix_reduction( + valid_op_type, {2, 4, 8, 16}, {1, 2}, m, n), + cuda::ApplicableMatrixReduction::None); + + // invalid op type + EXPECT_EQ( + cuda::get_applicable_matrix_reduction( + CUDNN_REDUCE_TENSOR_MAX, {2, 4, 8, 16}, {0, 1}, m, n), + cuda::ApplicableMatrixReduction::None); +} + +} // namespace test +} // namespace onnxruntime + +#endif diff --git a/onnxruntime/test/util/include/test_random_seed.h b/onnxruntime/test/util/include/test_random_seed.h index 0bff920579..8a91678026 100644 --- a/onnxruntime/test/util/include/test_random_seed.h +++ b/onnxruntime/test/util/include/test_random_seed.h @@ -8,12 +8,12 @@ namespace onnxruntime { namespace test { -// These variables control the behavior of GetTestRandomSeed(). +// These environment variables control the behavior of GetTestRandomSeed(). namespace test_random_seed_env_vars { // Specifies a fixed seed value to return. // If set, this has the highest precedence. constexpr const char* kValue = "ORT_TEST_RANDOM_SEED_VALUE"; -// If set (and not using a fixed value), specifies that a new seed value is returned each time. +// If set to 1 (and not using a fixed value), specifies that a new seed value is returned each time. // The default behavior is to return the same cached seed value per process. // This is useful when repeatedly running flaky tests to reproduce errors. constexpr const char* kDoNotCache = "ORT_TEST_RANDOM_SEED_DO_NOT_CACHE"; diff --git a/orttraining/orttraining/test/training_ops/cuda/reduce_sum_test.cc b/orttraining/orttraining/test/training_ops/cuda/reduce_sum_test.cc index 9cfba8dd75..dd76668d95 100644 --- a/orttraining/orttraining/test/training_ops/cuda/reduce_sum_test.cc +++ b/orttraining/orttraining/test/training_ops/cuda/reduce_sum_test.cc @@ -24,10 +24,13 @@ static void TestReduceSum(const std::vector& X_dims, // create rand inputs RandomValueGenerator random{}; - std::vector X_data = random.Uniform(X_dims, -10.0f, 10.0f); + const bool is_positive = random.Uniform({1}, 0, 2)[0] == 0; + const float range_begin = is_positive ? 1.0f : -10.0f; + const float range_end = is_positive ? 10.0f : -1.0f; + const std::vector X_data = random.Uniform(X_dims, range_begin, range_end); test.AddInput("X", X_dims, X_data); - std::vector Y_data = FillZeros(Y_dims); + const std::vector Y_data = FillZeros(Y_dims); test.AddOutput("Y", Y_dims, Y_data); test.CompareWithCPU(kGpuExecutionProvider, per_sample_tolerance, relative_per_sample_tolerance); @@ -62,8 +65,7 @@ TEST(CudaKernelTest, ReduceSum_MidTensor) { std::vector Y_dims{3072}; std::vector axes{0, 1}; bool keepdims = false; - double per_sample_tolerance = 4e-4; - TestReduceSum(X_dims, Y_dims, axes, keepdims, per_sample_tolerance); + TestReduceSum(X_dims, Y_dims, axes, keepdims); } TEST(CudaKernelTest, ReduceSum_LargeTensor) { @@ -71,9 +73,31 @@ TEST(CudaKernelTest, ReduceSum_LargeTensor) { std::vector Y_dims{30528}; std::vector axes{0, 1}; bool keepdims = false; - double per_sample_tolerance = 5e-4; - double relative_per_sample_tolerance = 5e-2; - TestReduceSum(X_dims, Y_dims, axes, keepdims, per_sample_tolerance, relative_per_sample_tolerance); + TestReduceSum(X_dims, Y_dims, axes, keepdims); +} + +TEST(CudaKernelTest, ReduceSum_SmallTensorTrailingAxes) { + std::vector X_dims{128, 2, 128}; + std::vector Y_dims{128}; + std::vector axes{1, 2}; + bool keepdims = false; + TestReduceSum(X_dims, Y_dims, axes, keepdims); +} + +TEST(CudaKernelTest, ReduceSum_MidTensorTrailingAxes) { + std::vector X_dims{3072, 2, 512}; + std::vector Y_dims{3072}; + std::vector axes{1, 2}; + bool keepdims = false; + TestReduceSum(X_dims, Y_dims, axes, keepdims); +} + +TEST(CudaKernelTest, ReduceSum_LargeTensorTrailingAxes) { + std::vector X_dims{30528, 4, 512}; + std::vector Y_dims{30528}; + std::vector axes{1, 2}; + bool keepdims = false; + TestReduceSum(X_dims, Y_dims, axes, keepdims); } } // namespace test diff --git a/orttraining/orttraining/training_ops/cuda/loss/softmax_cross_entropy_loss_impl.cc b/orttraining/orttraining/training_ops/cuda/loss/softmax_cross_entropy_loss_impl.cc index 6770a632d5..51d2f7bfbb 100644 --- a/orttraining/orttraining/training_ops/cuda/loss/softmax_cross_entropy_loss_impl.cc +++ b/orttraining/orttraining/training_ops/cuda/loss/softmax_cross_entropy_loss_impl.cc @@ -11,16 +11,16 @@ namespace onnxruntime { namespace cuda { -#define REGISTER_KERNEL_VERSIONED_TYPED_TWO_TYPES(Class, T, Tin, domain, startver, endver) \ - ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_EX( \ - Class, \ - domain, \ - startver, endver, \ - T, Tin, \ - kCudaExecutionProvider, \ - KernelDefBuilder() \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ - .TypeConstraint("Tin", DataTypeImpl::GetTensorType()), \ +#define REGISTER_KERNEL_VERSIONED_TYPED_TWO_TYPES(Class, T, Tin, domain, startver, endver) \ + ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_EX( \ + Class, \ + domain, \ + startver, endver, \ + T, Tin, \ + kCudaExecutionProvider, \ + KernelDefBuilder() \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("Tin", DataTypeImpl::GetTensorType()), \ Class); #define REGISTER_KERNEL_TYPED_TWO_TYPES(Class, T, Tin, domain, version) \ @@ -113,16 +113,17 @@ Status SoftmaxCrossEntropyLoss::ComputeInternal(OpKernelContext* ctx) co auto normalize_factor_data = GetScratchBuffer(1); if (reduction_ == ReductionType::MEAN) { // Compute buffer size in byte for reduction APIs. - const auto buffer_size = static_cast( - compute_reduction_buffer_size( - static_cast(sizeof(T)), static_cast(N_D))); + const auto buffer_size = + compute_reduction_buffer_size(static_cast(N_D)); // Allocate reduction buffer whose size is buffer_size bytes. IAllocatorUniquePtr reduction_buffer = GetScratchBuffer( buffer_size); - reduce_sum(weight_data_nd_data, - normalize_factor_data.get(), - static_cast(N_D), - reinterpret_cast(reduction_buffer.get())); + ORT_RETURN_IF_ERROR(reduce_sum( + weight_data_nd_data, + normalize_factor_data.get(), + static_cast(N_D), + reduction_buffer.get(), + buffer_size)); } else { const T normalize_factor = static_cast(1); CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), cudaMemcpyHostToDevice)); @@ -213,16 +214,17 @@ Status SoftmaxCrossEntropyLossGrad::ComputeInternal(OpKernelContext* ctx auto normalize_factor_data = GetScratchBuffer(1); if (reduction_ == ReductionType::MEAN) { // Compute buffer size in byte for reduction APIs. - const auto buffer_size = static_cast( - compute_reduction_buffer_size( - static_cast(sizeof(T)), static_cast(N_D))); + const auto buffer_size = + compute_reduction_buffer_size(static_cast(N_D)); // Allocate reduction buffer whose size is buffer_size bytes. IAllocatorUniquePtr reduction_buffer = GetScratchBuffer( buffer_size); - reduce_sum(weight_data_nd_data, - normalize_factor_data.get(), - static_cast(N_D), - reinterpret_cast(reduction_buffer.get())); + ORT_RETURN_IF_ERROR(reduce_sum( + weight_data_nd_data, + normalize_factor_data.get(), + static_cast(N_D), + reduction_buffer.get(), + buffer_size)); } else { const T normalize_factor = static_cast(1); CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), cudaMemcpyHostToDevice)); diff --git a/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.cc b/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.cc index d2683d84cb..59ca3930bc 100644 --- a/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.cc +++ b/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.cc @@ -175,16 +175,17 @@ Status SparseSoftmaxCrossEntropy::ComputeInternal(OpKernelContext* ctx) cudaMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), cudaMemcpyHostToDevice); } else { // Compute buffer size in byte for reduction APIs. - const auto buffer_size = static_cast( - compute_reduction_buffer_size( - static_cast(sizeof(T)), static_cast(N))); + const auto buffer_size = + compute_reduction_buffer_size(static_cast(N)); // Allocate reduction buffer whose size is buffer_size bytes. IAllocatorUniquePtr reduction_buffer = GetScratchBuffer( buffer_size); - reduce_sum(weight_data, - normalize_factor_data.get(), - static_cast(N), - reinterpret_cast(reduction_buffer.get())); + ORT_RETURN_IF_ERROR(reduce_sum( + weight_data, + normalize_factor_data.get(), + static_cast(N), + reduction_buffer.get(), + buffer_size)); } } @@ -251,16 +252,17 @@ Status SparseSoftmaxCrossEntropyGrad::ComputeInternal(OpKernelContext* c cudaMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), cudaMemcpyHostToDevice); } else { // Compute buffer size in byte for reduction APIs. - const auto buffer_size = static_cast( - compute_reduction_buffer_size( - static_cast(sizeof(T)), static_cast(N))); + const auto buffer_size = + compute_reduction_buffer_size(static_cast(N)); // Allocate reduction buffer whose size is buffer_size bytes. IAllocatorUniquePtr reduction_buffer = GetScratchBuffer( buffer_size); - reduce_sum(weight_data, - normalize_factor_data.get(), - static_cast(N), - reinterpret_cast(reduction_buffer.get())); + ORT_RETURN_IF_ERROR(reduce_sum( + weight_data, + normalize_factor_data.get(), + static_cast(N), + reduction_buffer.get(), + buffer_size)); } } diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc index f5d8c802e5..ecd4703336 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc +++ b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc @@ -46,24 +46,24 @@ std::vector> GenerateLambExtraAliasMapping() { } // TODO: Once Schema is checked in to onnx lets fix this to match that -#define REGISTER_LAMB_KERNEL_TYPED(T1, T2, T3, T4, T_GRAD_NORM, T_MIXED_PRECISION_FP) \ - ONNX_OPERATOR_TYPED_KERNEL_EX( \ - LambOptimizer, \ - kMSDomain, \ - 1, \ - T1##_##T2##_##T3##_##T4##_##T_GRAD_NORM##_##T_MIXED_PRECISION_FP, \ - kCudaExecutionProvider, \ - KernelDefBuilder() \ - .Alias(GenerateLambExtraAliasMapping()) \ - .InputMemoryType(0) /* Keep do_update in CPU */ \ - .InputMemoryType(4) /* Keep iteration_count in CPU */ \ - .OutputMemoryType(0) /* Keep iteration_count in CPU */ \ - .TypeConstraint("T1", DataTypeImpl::GetTensorType()) \ - .TypeConstraint("T2", DataTypeImpl::GetTensorType()) \ - .TypeConstraint("T3", DataTypeImpl::GetTensorType()) \ - .TypeConstraint("T4", DataTypeImpl::GetTensorType()) \ - .TypeConstraint("T_MIXED_PRECISION_FP", DataTypeImpl::GetTensorType()) \ - .TypeConstraint("T_GRAD_NORM", DataTypeImpl::GetTensorType()), \ +#define REGISTER_LAMB_KERNEL_TYPED(T1, T2, T3, T4, T_GRAD_NORM, T_MIXED_PRECISION_FP) \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + LambOptimizer, \ + kMSDomain, \ + 1, \ + T1##_##T2##_##T3##_##T4##_##T_GRAD_NORM##_##T_MIXED_PRECISION_FP, \ + kCudaExecutionProvider, \ + KernelDefBuilder() \ + .Alias(GenerateLambExtraAliasMapping()) \ + .InputMemoryType(0) /* Keep do_update in CPU */ \ + .InputMemoryType(4) /* Keep iteration_count in CPU */ \ + .OutputMemoryType(0) /* Keep iteration_count in CPU */ \ + .TypeConstraint("T1", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("T2", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("T3", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("T4", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("T_MIXED_PRECISION_FP", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("T_GRAD_NORM", DataTypeImpl::GetTensorType()), \ LambOptimizer); REGISTER_LAMB_KERNEL_TYPED(float, float, MLFloat16, float, MLFloat16, MLFloat16) @@ -112,7 +112,6 @@ Status copy_inputs_to_outputs( const int group_count, const int input_group_size, const int output_group_size) { - const Tensor* step_tensor = ctx->Input(4); if (step_tensor) { const int64_t* step_data = step_tensor->template Data(); @@ -204,10 +203,10 @@ Status launch_lamb_compute_direction( for (int i = 0; i < group_count; ++i) { if (tensor_sizes[i] > max_tensor_size) { // For the first iteration (indexed by 0), the update count should be 2. - const float alpha_correction = do_bias_correction ? - onnxruntime::contrib::compute_bias_correction_coefficient(alphas[i], update_count) : 1.f; - const float beta_correction = do_bias_correction ? - onnxruntime::contrib::compute_bias_correction_coefficient(betas[i], update_count) : 1.f; + const float alpha_correction = + do_bias_correction ? onnxruntime::contrib::compute_bias_correction_coefficient(alphas[i], update_count) : 1.f; + const float beta_correction = + do_bias_correction ? onnxruntime::contrib::compute_bias_correction_coefficient(betas[i], update_count) : 1.f; LambComputeDirection( p_ws[i], @@ -248,9 +247,9 @@ Status launch_lamb_compute_direction( // For the first iteration (indexed by 0), the update count should be 1. const float alpha_correction = - do_bias_correction ? onnxruntime::contrib::compute_bias_correction_coefficient(alpha, update_count) : 1.f; + do_bias_correction ? onnxruntime::contrib::compute_bias_correction_coefficient(alpha, update_count) : 1.f; const float beta_correction = - do_bias_correction ? onnxruntime::contrib::compute_bias_correction_coefficient(beta, update_count) : 1.f; + do_bias_correction ? onnxruntime::contrib::compute_bias_correction_coefficient(beta, update_count) : 1.f; typedef LambMultiTensorComputeDirectionFunctor LambStage1; LambStage1 lamb_stage1; @@ -274,7 +273,8 @@ Status launch_lamb_reduction( std::vector& p_d_norms, std::vector& p_ws, std::vector& p_ds, - CudaTNorm* reduction_buffer) { + void* reduction_buffer, + size_t reduction_buffer_size) { ORT_ENFORCE(group_count == static_cast(tensor_sizes.size())); ORT_ENFORCE(group_count == static_cast(p_w_norms.size())); @@ -292,16 +292,18 @@ Status launch_lamb_reduction( const int max_tensor_size = compute_max_tensor_size_per_launch(4); for (int i = 0; i < group_count; ++i) { if (tensor_sizes[i] > max_tensor_size) { - reduce_square_sum( + ORT_RETURN_IF_ERROR(reduce_square_sum( p_ws[i], p_w_norms[i], tensor_sizes[i], - reduction_buffer); - reduce_square_sum( + reduction_buffer, + reduction_buffer_size)); + ORT_RETURN_IF_ERROR(reduce_square_sum( p_ds[i], p_d_norms[i], tensor_sizes[i], - reduction_buffer); + reduction_buffer, + reduction_buffer_size)); } else { std::vector ptrs(tensor_count_per_group); ptrs[0] = const_cast(p_ws[i]); // weight tensor @@ -406,12 +408,13 @@ Status launch_lamb_update( // Only launch multi-tensor function if we have at least one tensor in the buckets. if (tensor_sizes_in_bucket.size() > 0 && buckets.size() > 0) { typedef LambMultiTensorUpdateFunctor< - CudaT1, CudaT2, CudaT3, CudaT_MIXED_PRECISION_FP> LambStage2; + CudaT1, CudaT2, CudaT3, CudaT_MIXED_PRECISION_FP> + LambStage2; LambStage2 lamb_stage2; launch_multi_tensor_functor< - tensor_count_per_group, LambStage2, - const CudaT1*, const float, const float>( + tensor_count_per_group, LambStage2, + const CudaT1*, const float, const float>( 2048 * 32, tensor_sizes_in_bucket, buckets, @@ -540,13 +543,11 @@ Status LambOptimizer::Compute } // Allocate a buffer in byte for reduction API calls. - const auto buffer_size = static_cast( - compute_reduction_buffer_size( - static_cast(sizeof(T2)), max_tensor_size)); + const auto reduction_buffer_size = + compute_reduction_buffer_size(max_tensor_size); - // Allocate reduction buffer whose size is buffer_size bytes. - IAllocatorUniquePtr reduction_buffer = GetScratchBuffer(buffer_size); - CudaT2* reduction_data = reinterpret_cast(reduction_buffer.get()); + // Allocate reduction buffer whose size is reduction_buffer_size bytes. + IAllocatorUniquePtr reduction_buffer = GetScratchBuffer(reduction_buffer_size); // Input tensors' pointers. std::vector p_ws(group_count); @@ -626,8 +627,7 @@ Status LambOptimizer::Compute p_w_mixed_precision_news[group_index] = w_mixed_precision_new != nullptr ? reinterpret_cast(w_mixed_precision_new->template MutableData()) : nullptr; } - - launch_lamb_compute_direction( + ORT_RETURN_IF_ERROR(launch_lamb_compute_direction( step_data ? *step_data : 0, group_count, loss_scale_data, @@ -637,18 +637,19 @@ Status LambOptimizer::Compute p_ds, p_m1_news, p_m2_news, alpha_, beta_, lambda_, epsilon_, - do_bias_correction_); + do_bias_correction_)); - launch_lamb_reduction( + ORT_RETURN_IF_ERROR(launch_lamb_reduction( group_count, tensor_sizes, p_w_norms, p_d_norms, p_ws, p_ds, - reduction_data); + reduction_buffer.get(), + reduction_buffer_size)); - launch_lamb_update( + ORT_RETURN_IF_ERROR(launch_lamb_update( group_count, eta_data, ratio_min_, @@ -660,7 +661,7 @@ Status LambOptimizer::Compute p_ds, p_w_news, p_g_news, - p_w_mixed_precision_news); + p_w_mixed_precision_news)); if (step_tensor) { Tensor* step_tensor_new = ctx->Output(0, step_tensor->Shape()); diff --git a/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cc b/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cc index 21520db123..c497424ea3 100644 --- a/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cc +++ b/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cc @@ -46,7 +46,7 @@ Status ReduceAllL2::ComputeInternal(OpKernelContext* ctx) const { CudaTOut* p_output = reinterpret_cast(output->template MutableData()); CUDA_RETURN_IF_ERROR(cudaMemsetAsync(p_output, 0, sizeof(CudaTOut))); - auto ctx_internal = static_cast(ctx); + auto ctx_internal = dynamic_cast(ctx); bool deterministic = ctx_internal && ctx_internal->GetUseDeterministicCompute(); if (!deterministic) { @@ -65,32 +65,36 @@ Status ReduceAllL2::ComputeInternal(OpKernelContext* ctx) const { // alternate path only for deterministic compute .. typedef AccumulationType_t CudaTAcc; - // find scratch buffer size needed by 'reduce_square_sum' for each tensor - int scratch_size = 0; + // find reduction buffer size needed by 'reduce_square_sum' for each tensor + size_t reduction_buffer_size = 0; for (int i = 0; i < total_tensor_count; ++i) { - scratch_size = std::max(scratch_size, compute_reduction_buffer_size(sizeof(CudaTAcc), tensor_sizes[i])); + reduction_buffer_size = + std::max(reduction_buffer_size, compute_reduction_buffer_size(tensor_sizes[i])); } - // enlarge scratch buffer size for 'reduce_sum' over tensor square norms - scratch_size = std::max(scratch_size, compute_reduction_buffer_size(sizeof(CudaTAcc), total_tensor_count)); - - // add head room for final output and square norms of each tensor - scratch_size += (1 + total_tensor_count) * sizeof(CudaTAcc); + // enlarge reduction buffer size for 'reduce_sum' over tensor square norms + reduction_buffer_size = + std::max(reduction_buffer_size, compute_reduction_buffer_size(total_tensor_count)); // create GPU scratch space and zero target for each tensor square norm - auto scratch_buffer = GetScratchBuffer(scratch_size); - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(scratch_buffer.get(), 0, sizeof(CudaTAcc) * (1 + total_tensor_count))); + auto reduction_buffer = GetScratchBuffer(reduction_buffer_size); - CudaTAcc* p_global_sqnorm = reinterpret_cast(scratch_buffer.get()); + // buffer for final output and square norms of each tensor + auto results_buffer = GetScratchBuffer(1 + total_tensor_count); + + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(results_buffer.get(), 0, sizeof(CudaTAcc) * (1 + total_tensor_count))); + + CudaTAcc* p_global_sqnorm = results_buffer.get(); CudaTAcc* p_tensor_sqnorm = p_global_sqnorm + 1; - CudaTAcc* p_reduce_buffer = p_tensor_sqnorm + total_tensor_count; // perform reduction l2norm = sqrt[sum(tensor[i][j]**2)] for i,j over all tensor elements for (int i = 0; i < total_tensor_count; ++i) { CudaTIn* p_tensor_i = reinterpret_cast(grouped_tensor_pointers[i][0]); - reduce_square_sum(p_tensor_i, p_tensor_sqnorm + i, tensor_sizes[i], p_reduce_buffer); + ORT_RETURN_IF_ERROR(reduce_square_sum( + p_tensor_i, p_tensor_sqnorm + i, tensor_sizes[i], reduction_buffer.get(), reduction_buffer_size)); } - reduce_sum(p_tensor_sqnorm, p_global_sqnorm, total_tensor_count, p_reduce_buffer); + ORT_RETURN_IF_ERROR(reduce_sum( + p_tensor_sqnorm, p_global_sqnorm, total_tensor_count, reduction_buffer.get(), reduction_buffer_size)); ScalarSqrt(p_global_sqnorm, p_output); } diff --git a/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cu b/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cu index 0f00e0fc5c..185018ed97 100644 --- a/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cu +++ b/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cu @@ -6,18 +6,19 @@ #include "core/providers/cuda/cuda_common.h" #include "core/providers/cuda/atomic/common.cuh" #include "core/providers/cuda/reduction/reduction_utils.cuh" +#include "core/providers/cuda/shared_inc/accumulation_type.h" namespace onnxruntime { namespace cuda { -template -__global__ void _ScalarSqrtImpl(Tin* input, Tout* output) { +template +__global__ void ScalarSqrtKernel(Tin* input, Tout* output) { *output = (Tout)_Sqrt(*input); }; -template +template void ScalarSqrt(Tin* input, Tout* output) { - _ScalarSqrtImpl<<<1, 1, 0>>>(input, output); + ScalarSqrtKernel<<<1, 1, 0>>>(input, output); }; template void ScalarSqrt(float* input, float* output); @@ -25,7 +26,7 @@ template void ScalarSqrt(half* input, half* output); template void ScalarSqrt(float* input, half* output); template -__global__ void _MultiTensorReduceImpl(ChunkGroup<1> chunk_group, TOut* output) { +__global__ void MultiTensorReduceKernel(ChunkGroup<1> chunk_group, TOut* output) { const int group_index = chunk_group.block_index_to_tensor_group_index[blockIdx.x]; const int tensor_size = chunk_group.tensor_sizes[group_index]; const int chunk_size = chunk_group.chunk_size; @@ -76,7 +77,7 @@ __global__ void _MultiTensorReduceImpl(ChunkGroup<1> chunk_group, TOut* output) } if (threadIdx.x == 0) { - atomic_add(w_norm, TOutOp()(shared_memory[0])); + atomic_add(w_norm, TOutOp()(TOut(shared_memory[0]))); } }; @@ -91,13 +92,13 @@ void MultiTensorReduce(ChunkGroup<1> chunk_group, TOut* output) { ORT_ENFORCE(thread_count % GPU_WARP_SIZE == 0); ORT_ENFORCE((thread_count & (thread_count - 1)) == 0); - _MultiTensorReduceImpl<<>>(chunk_group, output); + MultiTensorReduceKernel<<>>(chunk_group, output); } template void MultiTensorReduceL2::operator()(ChunkGroup<1> chunk_group, TOut* output) { - typedef typename ToBuffer::Type TBuf; - MultiTensorReduce, Cast>(chunk_group, output); + using TBuf = AccumulationType_t; + MultiTensorReduce(chunk_group, output); } #define INSTANTIATE_MULTI_TENSOR_REDUCTION_L2_FUNCTOR(TIn, TOut) \ diff --git a/orttraining/orttraining/training_ops/cuda/reduction/reduction_ops.cc b/orttraining/orttraining/training_ops/cuda/reduction/reduction_ops.cc index a9d701b368..ec01d5f5e5 100644 --- a/orttraining/orttraining/training_ops/cuda/reduction/reduction_ops.cc +++ b/orttraining/orttraining/training_ops/cuda/reduction/reduction_ops.cc @@ -59,7 +59,7 @@ Status ReduceKernel::ComputeImplEx(OpKernelContext* ctx, cudnn Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims); bool fast_reduction = fast_reduction_; if (fast_reduction) { - auto ctx_internal = static_cast(ctx); + auto ctx_internal = dynamic_cast(ctx); if (ctx_internal && ctx_internal->GetUseDeterministicCompute()) fast_reduction = false; }