Merged PR 5686: fix P100/fp16 issues

1. misaligned address in atomic_add()
2. GatherNDGradKernel to use atomic_add
3. enable/add UTs for GatherNDGrad and reduction_ops using half
- __CUDA_ARCH__ won't take effect on .cc code, leverage HasCudaEnvironment() instead
4. verified convergence graph and perf test
- p100 is much slower than v100 on fp16
- fp16/128 need to reduce batch size from 66 to 64 to avoid OOM issue
5. verify convergence test on Dev3/v100

TBD - broken UTs related to MatmulIntegerOpTest (works on v100/windows, though)
This commit is contained in:
Ethan Tao 2020-03-12 04:50:09 +00:00 committed by edgchen1
parent 75025461e2
commit 2f1e997e5b
6 changed files with 96 additions and 22 deletions

View file

@ -31,20 +31,23 @@ __device__ __forceinline__ void atomic_add(double *address, double value) {
#endif
}
//
// ref: https://github.com/pytorch/pytorch/blob/master/aten/src/THC/THCAtomics.cuh
//
__device__ __forceinline__ void atomic_add(half *address, half value) {
#if __CUDA_ARCH__ < 700
half packed_old[2];
half packed_new[2];
int* const p_packed_old = reinterpret_cast<int*>(packed_old);
int* const p_packed_new = reinterpret_cast<int*>(packed_new);
int seen_old_value = 0;
unsigned int* base_address = (unsigned int*)((char*)address - ((size_t)address & 2));
unsigned int old = *base_address;
unsigned int assumed;
unsigned short x;
do {
packed_old[0] = *address;
packed_old[1] = *(address + 1);
packed_new[0] = half(float(packed_old[0]) + float(value));
packed_new[1] = packed_old[1];
seen_old_value = atomicCAS(reinterpret_cast<int*>(address), *p_packed_old, *p_packed_new);
} while (seen_old_value != *p_packed_old);
assumed = old;
x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
x = __half_as_short(__float2half(__half2float(*reinterpret_cast<const __half*>(&x)) + __half2float(value)));
old = (size_t)address & 2 ? (old & 0xffff) | (x << 16) : (old & 0xffff0000) | x;
old = atomicCAS(base_address, assumed, old);
} while (assumed != old);
#else
atomicAdd(address, value);
#endif

View file

@ -888,6 +888,72 @@ TEST(ReductionOpTest, ReduceSum_int32) {
test.Run();
}
#ifdef USE_CUDA
TEST(ReductionOpTest, ReduceSumHalfHalf) {
OpTester test("ReduceSum");
test.AddAttribute("keepdims", (int64_t)0);
test.AddAttribute("axes", std::vector<int64_t>{0, 1});
std::vector<float> data = {1.0f, 2.0f,
3.0f, 4.0f,
5.0f, 6.0f,
7.0f, 8.0f,
9.0f, 10.0f,
11.0f, 12.0f};
std::vector<MLFloat16> data_half(12);
ConvertFloatToMLFloat16(data.data(), data_half.data(), 12);
std::vector<float> result = {36.0f, 42.0f};
std::vector<MLFloat16> result_half(2);
ConvertFloatToMLFloat16(result.data(), result_half.data(), 2);
test.AddInput<MLFloat16>("data", {3, 2, 2}, data_half);
test.AddOutput<MLFloat16>("reduced", {2}, result_half);
test.Run();
}
void test_half_reduce_sum(
int64_t m, int64_t n) {
OpTester test("ReduceSum");
// Input tensor.
std::vector<float> X(m * n, 0.0f);
// Reduced tensor.
std::vector<float> Y(n, 0.0f);
// Random number generator.
std::default_random_engine generator(0);
std::uniform_real_distribution<float> distribution(0.0, 1.0);
for (int64_t i = 0; i < m; ++i) {
for (int64_t j = 0; j < n; ++j) {
const float value = distribution(generator) / float(m);
X[i * n + j] = value;
Y[j] += value;
}
}
std::vector<MLFloat16> X_half(m * n);
ConvertFloatToMLFloat16(X.data(), X_half.data(), int(m * n));
std::vector<MLFloat16> Y_half(n);
ConvertFloatToMLFloat16(Y.data(), Y_half.data(), int(n));
test.AddAttribute("keepdims", (int64_t)0);
test.AddAttribute("axes", std::vector<int64_t>{0});
test.AddInput<MLFloat16>("data", {m, n}, X_half);
test.AddOutput<MLFloat16>("reduced", {n}, Y_half);
test.Run();
}
TEST(ReductionOpTest, ReduceSum_half_bert) {
test_half_reduce_sum(6 * 128, 128);
test_half_reduce_sum(8 * 128, 128);
test_half_reduce_sum(6 * 384, 128);
test_half_reduce_sum(8 * 384, 128);
}
// Add more UTs for half as needed
#endif
TEST(ReductionOpTest, ReduceSum_apex_reduction) {
OpTester test("ReduceSum");
test.AddAttribute("keepdims", (int64_t)0);

View file

@ -197,7 +197,7 @@ void Check<MLFloat16>(const OpTester::Data& expected_data,
}
float threshold = 0.001f;
#if defined(USE_TENSORRT) || defined(ENABLE_TRAINING)
#if defined(USE_TENSORRT) || defined(ENABLE_TRAINING) || defined(USE_CUDA)
threshold = 0.005f;
#endif
for (int i = 0; i < size; ++i) {

View file

@ -3,6 +3,7 @@
#include "gtest/gtest.h"
#include "test/providers/provider_test_utils.h"
#include "test/common/cuda_op_test_utils.h"
namespace onnxruntime {
namespace test {
@ -231,8 +232,9 @@ TEST(GatherNDOpTest, GatherNDGrad_slice_float_int64_t_axis_1) {
#endif
#ifdef USE_CUDA
#if __CUDA_ARCH__ >= 600
TEST(GatherNDOpTest, GatherNDGrad_slice_double_int32_t_axis_3) {
if (!HasCudaEnvironment(600 /*min_cuda_architecture*/)) return;
OpTester test("GatherNDGrad", 1, onnxruntime::kOnnxDomain);
test.AddAttribute<int64_t>("axis", 1);
test.AddInput<int64_t>("shape", {3}, {2LL, 2LL, 3LL});
@ -243,6 +245,8 @@ TEST(GatherNDOpTest, GatherNDGrad_slice_double_int32_t_axis_3) {
}
TEST(GatherNDOpTest, GatherND_slice_double_int64_t_axis_3) {
if (!HasCudaEnvironment(600 /*min_cuda_architecture*/)) return;
OpTester test("GatherND", 1, onnxruntime::kOnnxDomain);
test.AddAttribute<int64_t>("axis", 1);
test.AddInput<double>("data", {2, 2, 2}, ValueRange(8, 0.0, 0.1));
@ -250,10 +254,10 @@ TEST(GatherNDOpTest, GatherND_slice_double_int64_t_axis_3) {
test.AddOutput<double>("output", {2, 1, 2}, {0.2f, 0.3f, 0.4f, 0.5f});
test.Run();
}
#endif
#if __CUDA_ARCH__ >= 700
TEST(GatherNDOpTest, GatherNDGrad_slice_half_int32_t_axis_3) {
if (!HasCudaEnvironment(600 /*min_cuda_architecture*/)) return;
OpTester test("GatherNDGrad", 1, onnxruntime::kOnnxDomain);
test.AddAttribute<int64_t>("axis", 1);
test.AddInput<int64_t>("shape", {3}, {2LL, 2LL, 3LL});
@ -270,6 +274,8 @@ TEST(GatherNDOpTest, GatherNDGrad_slice_half_int32_t_axis_3) {
}
TEST(GatherNDOpTest, GatherND_slice_half_int32_t) {
if (!HasCudaEnvironment(600 /*min_cuda_architecture*/)) return;
OpTester test("GatherND", 1, onnxruntime::kOnnxDomain);
std::vector<float> data_f({0.0f, 0.1f, 0.2f, 0.3f});
std::vector<float> outputs_f({0.2f, 0.3f, 0.0f, 0.1f});
@ -283,7 +289,6 @@ TEST(GatherNDOpTest, GatherND_slice_half_int32_t) {
test.Run();
}
#endif
#endif
#ifdef USE_CUDA
TEST(GatherNDOpTest, GatherND_axis_of_2) {

View file

@ -18,8 +18,8 @@ class LambOptimizer final : public CudaKernel {
beta_ = info.GetAttrsOrDefault("beta", std::vector<float>(1024, 0.999f));
lambda_ = info.GetAttrsOrDefault("lambda", std::vector<float>(1024, 0.0f));
epsilon_ = info.GetAttrsOrDefault("epsilon", std::vector<float>(1024, 1e-6f));
info.GetAttr("ratio_min", &ratio_min_);
info.GetAttr("ratio_max", &ratio_max_);
ORT_ENFORCE(info.GetAttr<float>("ratio_min", &ratio_min_).IsOK(), "Missing/Invalid 'ratio_min' attribute value");
ORT_ENFORCE(info.GetAttr<float>("ratio_max", &ratio_max_).IsOK(), "Missing/Invalid 'ratio_max' attribute value");
}
Status ComputeInternal(OpKernelContext* context) const override;

View file

@ -4,6 +4,7 @@
#include "orttraining/training_ops/cuda/tensor/gather_nd_impl.h"
#include "core/providers/cuda/cu_inc/common.cuh"
#include "core/providers/cuda/atomic/common.cuh"
namespace onnxruntime {
namespace cuda {
@ -53,7 +54,7 @@ __global__ void _GatherNDGradKernel(
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(i, num_slices * slice_size);
uint64_t slice_offset = slice_offsets[i / slice_size];
size_t j = i % slice_size;
atomicAdd(output_data + slice_offset + j, update_data[i]);
atomic_add(output_data + slice_offset + j, update_data[i]);
};
template <typename TIndex>
@ -121,11 +122,10 @@ SPECIALIZED_COMPUTE_SLICE_OFFSETS_IMPL(int64_t);
SPECIALIZED_IMPL(float);
SPECIALIZED_GRAD_IMPL(float);
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
SPECIALIZED_IMPL(half);
SPECIALIZED_GRAD_IMPL(half);
#endif
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
SPECIALIZED_IMPL(double);
SPECIALIZED_GRAD_IMPL(double);
#endif