Use __launch_bounds__ workaround, rather than limiting threads to 256 on AMD.

This commit is contained in:
Jesse Benson 2020-12-02 13:52:27 -08:00 committed by Jesse Benson
parent 98ea7372d3
commit 14f6eb14b1
3 changed files with 1 additions and 149 deletions

View file

@ -1,148 +0,0 @@
//
// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
// NVIDIA/apex is licensed under the
// BSD 3 - Clause "New" or "Revised" License
//
/* Modifications Copyright (c) Microsoft. */
#pragma once
#include <vector>
namespace onnxruntime {
namespace rocm {
// initial reference from:
// https://github.com/NVIDIA/apex/blob/5b71d3695bf39efcdcda9dff5be2f70314b8f091/csrc/multi_tensor_apply.cuh#L15
// further experiment to get the number below. The larger the better, but if too large, it won't fit into GPU stack.
constexpr int ACTUAL_TENSOR_GROUP_SIZE[8] = {1, 1, 2, 3, 4, 5, 6, 7};
constexpr int MAX_BLOCK_COUNTS[8] = {256, 320, 320, 320, 320, 288, 288, 256};
constexpr int MAX_TENSOR_GROUP_COUNTS[8] = {1, 96, 64, 32, 32, 32, 32, 32};
constexpr int MAX_BLOCK_THREAD_COUNTS[8] = {256, 256, 256, 256, 256, 256, 256, 256};
// TensorGroupSize is the number of parallel tensors. For element-wise
// operators such as Relu, it should be 1. For two-operand operators such as
// element-wise addition, it should be 2. The value 0 is reserved for implementing
// kernels to handle a single large tensor.
template <int TensorGroupSize>
struct ChunkGroup {
// Number of chunks in this ChunkGroup.
// It's the effective size of block_index_to_tensor_group_index and
// block_index_to_chunk_start_index.
// The i-th chunk starts at the block_index_to_chunk_start_index[i]-th
// element in the block_index_to_tensor_group_index[i]-th tensor.
int chunk_count = 0;
// Max number of elements in each chunk in this ChunkGroup.
// It's an upper bound because chunks locating in the end of tensors
// are not always full. For example, if we split a 7-element vector into
// two 4-element chunks, the second chunk may contain only 3 actual values.
int chunk_size = 0;
// blkIdx.x block processes chunks in block_index_to_tensor_group_index[blkIdx.x]-th
// tensor group. Each chunk starts from block_index_to_chunk_start_index[blkIdx.x]-th
// element until reaching the end of this chunk or the end of the whole tensor.
//
// Let i = block_index_to_tensor_group_index[blkIdx.x]
// n = tensor_sizes[i]
// b = block_index_to_chunk_start_index[blkIdx.x]
// e = min(b + chunk_size, n)
// The valid index range for blockIdx.x is defined by the following equation.
// b <= valid index < e
int block_index_to_tensor_group_index[MAX_BLOCK_COUNTS[TensorGroupSize]];
int block_index_to_chunk_start_index[MAX_BLOCK_COUNTS[TensorGroupSize]];
int tensor_sizes[MAX_TENSOR_GROUP_COUNTS[TensorGroupSize]];
// The addresses of tensors where the chunks are extracted from.
// 1. tensor_ptrs[0][i], ..., tensor_ptrs[TensorGroupSize-1][i] are
// the tensors' pointers in the i-th group.
// 2. All tensors in the i-th group have the same size, tensor_sizes[i].
void* tensor_ptrs[ACTUAL_TENSOR_GROUP_SIZE[TensorGroupSize]][MAX_TENSOR_GROUP_COUNTS[TensorGroupSize]];
// Max number of GPU blocks to process the chunks in this chunk group.
const static int max_block_count = MAX_BLOCK_COUNTS[TensorGroupSize];
// Max number of tensor groups in this chunk group.
const static int max_tensor_group_count = MAX_TENSOR_GROUP_COUNTS[TensorGroupSize];
// The suggested number of threads to launch per GPU block.
const static int thread_count_per_block = MAX_BLOCK_THREAD_COUNTS[TensorGroupSize];
};
template <int TensorGroupSize>
int compute_max_tensor_size_per_launch(int element_count_per_thread) {
constexpr int block_count =
ChunkGroup<TensorGroupSize>::max_block_count;
constexpr int thread_count_per_block =
ChunkGroup<TensorGroupSize>::thread_count_per_block;
return block_count * thread_count_per_block * element_count_per_thread;
}
template <int TensorGroupSize, typename TMultiTensorFunctor, typename... TFunctorParams>
void launch_multi_tensor_functor(
const int chunk_size,
std::vector<int>& tensor_sizes,
std::vector<std::vector<void*>>& grouped_tensor_pointers,
TMultiTensorFunctor multipleTensorKernel,
TFunctorParams... kernelParams) {
ORT_ENFORCE(tensor_sizes.size() > 0);
ORT_ENFORCE(tensor_sizes.size() < static_cast<size_t>(std::numeric_limits<int>::max()));
ORT_ENFORCE(grouped_tensor_pointers.size() > 0);
ORT_ENFORCE(grouped_tensor_pointers.size() < static_cast<size_t>(std::numeric_limits<int>::max()));
ORT_ENFORCE(chunk_size > 0);
// Number of groups, for example, the number of updated weight tensors in Lamb optimizer.
const int group_count = static_cast<int>(grouped_tensor_pointers.size());
// Tensor count per group.
const int group_size = static_cast<int>(grouped_tensor_pointers[0].size());
int tensor_group_index = 0;
int block_index = 0;
// Check if 32-bit integer is enough.
ORT_ENFORCE(tensor_sizes.size() < static_cast<size_t>(std::numeric_limits<int>::max()));
ORT_ENFORCE(grouped_tensor_pointers.size() == tensor_sizes.size());
ORT_ENFORCE(group_size == ACTUAL_TENSOR_GROUP_SIZE[TensorGroupSize]);
for (int i = 0; i < group_count; ++i) {
ORT_ENFORCE(grouped_tensor_pointers[i].size() == static_cast<size_t>(group_size));
}
// Handle multiple tensors per ROCM kernel call.
ChunkGroup<TensorGroupSize> chunk_group;
for (int i = 0; i < group_count; ++i) {
// Add pointers to one group of tensors into chunk_group.
for (int j = 0; j < group_size; ++j) {
chunk_group.tensor_ptrs[j][tensor_group_index] = grouped_tensor_pointers[i][j];
}
// Assuming that all tensors' shapes are the same, we just record w's size.
chunk_group.tensor_sizes[tensor_group_index] = tensor_sizes[i];
chunk_group.chunk_size = chunk_size;
const int chunk_count = (tensor_sizes[i] + chunk_size - 1) / chunk_size;
// Process all chunks in this tensor group.
for (int chunk_index = 0; chunk_index < chunk_count; ++chunk_index) {
chunk_group.block_index_to_tensor_group_index[block_index] = tensor_group_index;
chunk_group.block_index_to_chunk_start_index[block_index] = chunk_index * chunk_size;
// After ++block_index, block_index becomes the count of chunks in chunk_group.
++block_index;
chunk_group.chunk_count = block_index;
if (block_index == chunk_group.max_block_count) {
multipleTensorKernel(chunk_group, kernelParams...);
block_index = 0;
}
}
// After ++tensor_group_index, tensor_group_index becomes the count of tensor group in chunk_group.
++tensor_group_index;
if (tensor_group_index == chunk_group.max_tensor_group_count) {
multipleTensorKernel(chunk_group, kernelParams...);
block_index = 0;
tensor_group_index = 0;
}
}
// This round of processing tensor group is finished.
// All the groups remain in chunk group should be processed right now.
if (block_index != 0) {
multipleTensorKernel(chunk_group, kernelParams...);
block_index = 0;
tensor_group_index = 0;
}
}
} // namespace rocm
} // namespace onnxruntime

View file

@ -441,6 +441,7 @@ INSTANTIATE_LAMB_MULTI_TENSOR_UPDATE_FUNCTOR(half, float, half, half)
INSTANTIATE_LAMB_MULTI_TENSOR_UPDATE_FUNCTOR(float, float, half, half)
template <typename TIn1, typename TIn2, typename TOut1, typename TOut2, typename TBuf>
__launch_bounds__(ChunkGroup<4>::thread_count_per_block)
__global__ void LambMultiTensorReductionImpl(ChunkGroup<4> chunk_group) {
const int group_index = chunk_group.block_index_to_tensor_group_index[blockIdx.x];
const int tensor_size = chunk_group.tensor_sizes[group_index];

View file

@ -111,7 +111,6 @@ core_ops_files = [
'math/variadic_elementwise_ops.h',
'math/variadic_elementwise_ops_impl.cu',
'math/variadic_elementwise_ops_impl.h',
'multi_tensor/common.cuh',
'nn/batch_norm.cc',
'nn/batch_norm.h',
'nn/conv.cc',