mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-06 00:03:22 +00:00
Merged PR 5686: fix P100/fp16 issues
1. misaligned address in atomic_add() 2. GatherNDGradKernel to use atomic_add 3. enable/add UTs for GatherNDGrad and reduction_ops using half - __CUDA_ARCH__ won't take effect on .cc code, leverage HasCudaEnvironment() instead 4. verified convergence graph and perf test - p100 is much slower than v100 on fp16 - fp16/128 need to reduce batch size from 66 to 64 to avoid OOM issue 5. verify convergence test on Dev3/v100 TBD - broken UTs related to MatmulIntegerOpTest (works on v100/windows, though)
This commit is contained in:
parent
75025461e2
commit
2f1e997e5b
6 changed files with 96 additions and 22 deletions
|
|
@ -31,20 +31,23 @@ __device__ __forceinline__ void atomic_add(double *address, double value) {
|
|||
#endif
|
||||
}
|
||||
|
||||
//
|
||||
// ref: https://github.com/pytorch/pytorch/blob/master/aten/src/THC/THCAtomics.cuh
|
||||
//
|
||||
__device__ __forceinline__ void atomic_add(half *address, half value) {
|
||||
#if __CUDA_ARCH__ < 700
|
||||
half packed_old[2];
|
||||
half packed_new[2];
|
||||
int* const p_packed_old = reinterpret_cast<int*>(packed_old);
|
||||
int* const p_packed_new = reinterpret_cast<int*>(packed_new);
|
||||
int seen_old_value = 0;
|
||||
unsigned int* base_address = (unsigned int*)((char*)address - ((size_t)address & 2));
|
||||
unsigned int old = *base_address;
|
||||
unsigned int assumed;
|
||||
unsigned short x;
|
||||
|
||||
do {
|
||||
packed_old[0] = *address;
|
||||
packed_old[1] = *(address + 1);
|
||||
packed_new[0] = half(float(packed_old[0]) + float(value));
|
||||
packed_new[1] = packed_old[1];
|
||||
seen_old_value = atomicCAS(reinterpret_cast<int*>(address), *p_packed_old, *p_packed_new);
|
||||
} while (seen_old_value != *p_packed_old);
|
||||
assumed = old;
|
||||
x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
|
||||
x = __half_as_short(__float2half(__half2float(*reinterpret_cast<const __half*>(&x)) + __half2float(value)));
|
||||
old = (size_t)address & 2 ? (old & 0xffff) | (x << 16) : (old & 0xffff0000) | x;
|
||||
old = atomicCAS(base_address, assumed, old);
|
||||
} while (assumed != old);
|
||||
#else
|
||||
atomicAdd(address, value);
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -888,6 +888,72 @@ TEST(ReductionOpTest, ReduceSum_int32) {
|
|||
test.Run();
|
||||
}
|
||||
|
||||
#ifdef USE_CUDA
|
||||
TEST(ReductionOpTest, ReduceSumHalfHalf) {
|
||||
OpTester test("ReduceSum");
|
||||
test.AddAttribute("keepdims", (int64_t)0);
|
||||
test.AddAttribute("axes", std::vector<int64_t>{0, 1});
|
||||
|
||||
std::vector<float> data = {1.0f, 2.0f,
|
||||
3.0f, 4.0f,
|
||||
|
||||
5.0f, 6.0f,
|
||||
7.0f, 8.0f,
|
||||
|
||||
9.0f, 10.0f,
|
||||
11.0f, 12.0f};
|
||||
std::vector<MLFloat16> data_half(12);
|
||||
ConvertFloatToMLFloat16(data.data(), data_half.data(), 12);
|
||||
|
||||
std::vector<float> result = {36.0f, 42.0f};
|
||||
std::vector<MLFloat16> result_half(2);
|
||||
ConvertFloatToMLFloat16(result.data(), result_half.data(), 2);
|
||||
|
||||
test.AddInput<MLFloat16>("data", {3, 2, 2}, data_half);
|
||||
test.AddOutput<MLFloat16>("reduced", {2}, result_half);
|
||||
test.Run();
|
||||
}
|
||||
|
||||
void test_half_reduce_sum(
|
||||
int64_t m, int64_t n) {
|
||||
OpTester test("ReduceSum");
|
||||
// Input tensor.
|
||||
std::vector<float> X(m * n, 0.0f);
|
||||
// Reduced tensor.
|
||||
std::vector<float> Y(n, 0.0f);
|
||||
// Random number generator.
|
||||
std::default_random_engine generator(0);
|
||||
std::uniform_real_distribution<float> distribution(0.0, 1.0);
|
||||
for (int64_t i = 0; i < m; ++i) {
|
||||
for (int64_t j = 0; j < n; ++j) {
|
||||
const float value = distribution(generator) / float(m);
|
||||
X[i * n + j] = value;
|
||||
Y[j] += value;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<MLFloat16> X_half(m * n);
|
||||
ConvertFloatToMLFloat16(X.data(), X_half.data(), int(m * n));
|
||||
|
||||
std::vector<MLFloat16> Y_half(n);
|
||||
ConvertFloatToMLFloat16(Y.data(), Y_half.data(), int(n));
|
||||
|
||||
test.AddAttribute("keepdims", (int64_t)0);
|
||||
test.AddAttribute("axes", std::vector<int64_t>{0});
|
||||
test.AddInput<MLFloat16>("data", {m, n}, X_half);
|
||||
test.AddOutput<MLFloat16>("reduced", {n}, Y_half);
|
||||
test.Run();
|
||||
}
|
||||
|
||||
TEST(ReductionOpTest, ReduceSum_half_bert) {
|
||||
test_half_reduce_sum(6 * 128, 128);
|
||||
test_half_reduce_sum(8 * 128, 128);
|
||||
test_half_reduce_sum(6 * 384, 128);
|
||||
test_half_reduce_sum(8 * 384, 128);
|
||||
}
|
||||
|
||||
// Add more UTs for half as needed
|
||||
#endif
|
||||
TEST(ReductionOpTest, ReduceSum_apex_reduction) {
|
||||
OpTester test("ReduceSum");
|
||||
test.AddAttribute("keepdims", (int64_t)0);
|
||||
|
|
|
|||
|
|
@ -197,7 +197,7 @@ void Check<MLFloat16>(const OpTester::Data& expected_data,
|
|||
}
|
||||
|
||||
float threshold = 0.001f;
|
||||
#if defined(USE_TENSORRT) || defined(ENABLE_TRAINING)
|
||||
#if defined(USE_TENSORRT) || defined(ENABLE_TRAINING) || defined(USE_CUDA)
|
||||
threshold = 0.005f;
|
||||
#endif
|
||||
for (int i = 0; i < size; ++i) {
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@
|
|||
|
||||
#include "gtest/gtest.h"
|
||||
#include "test/providers/provider_test_utils.h"
|
||||
#include "test/common/cuda_op_test_utils.h"
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace test {
|
||||
|
|
@ -231,8 +232,9 @@ TEST(GatherNDOpTest, GatherNDGrad_slice_float_int64_t_axis_1) {
|
|||
#endif
|
||||
|
||||
#ifdef USE_CUDA
|
||||
#if __CUDA_ARCH__ >= 600
|
||||
TEST(GatherNDOpTest, GatherNDGrad_slice_double_int32_t_axis_3) {
|
||||
if (!HasCudaEnvironment(600 /*min_cuda_architecture*/)) return;
|
||||
|
||||
OpTester test("GatherNDGrad", 1, onnxruntime::kOnnxDomain);
|
||||
test.AddAttribute<int64_t>("axis", 1);
|
||||
test.AddInput<int64_t>("shape", {3}, {2LL, 2LL, 3LL});
|
||||
|
|
@ -243,6 +245,8 @@ TEST(GatherNDOpTest, GatherNDGrad_slice_double_int32_t_axis_3) {
|
|||
}
|
||||
|
||||
TEST(GatherNDOpTest, GatherND_slice_double_int64_t_axis_3) {
|
||||
if (!HasCudaEnvironment(600 /*min_cuda_architecture*/)) return;
|
||||
|
||||
OpTester test("GatherND", 1, onnxruntime::kOnnxDomain);
|
||||
test.AddAttribute<int64_t>("axis", 1);
|
||||
test.AddInput<double>("data", {2, 2, 2}, ValueRange(8, 0.0, 0.1));
|
||||
|
|
@ -250,10 +254,10 @@ TEST(GatherNDOpTest, GatherND_slice_double_int64_t_axis_3) {
|
|||
test.AddOutput<double>("output", {2, 1, 2}, {0.2f, 0.3f, 0.4f, 0.5f});
|
||||
test.Run();
|
||||
}
|
||||
#endif
|
||||
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
TEST(GatherNDOpTest, GatherNDGrad_slice_half_int32_t_axis_3) {
|
||||
if (!HasCudaEnvironment(600 /*min_cuda_architecture*/)) return;
|
||||
|
||||
OpTester test("GatherNDGrad", 1, onnxruntime::kOnnxDomain);
|
||||
test.AddAttribute<int64_t>("axis", 1);
|
||||
test.AddInput<int64_t>("shape", {3}, {2LL, 2LL, 3LL});
|
||||
|
|
@ -270,6 +274,8 @@ TEST(GatherNDOpTest, GatherNDGrad_slice_half_int32_t_axis_3) {
|
|||
}
|
||||
|
||||
TEST(GatherNDOpTest, GatherND_slice_half_int32_t) {
|
||||
if (!HasCudaEnvironment(600 /*min_cuda_architecture*/)) return;
|
||||
|
||||
OpTester test("GatherND", 1, onnxruntime::kOnnxDomain);
|
||||
std::vector<float> data_f({0.0f, 0.1f, 0.2f, 0.3f});
|
||||
std::vector<float> outputs_f({0.2f, 0.3f, 0.0f, 0.1f});
|
||||
|
|
@ -283,7 +289,6 @@ TEST(GatherNDOpTest, GatherND_slice_half_int32_t) {
|
|||
test.Run();
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef USE_CUDA
|
||||
TEST(GatherNDOpTest, GatherND_axis_of_2) {
|
||||
|
|
|
|||
|
|
@ -18,8 +18,8 @@ class LambOptimizer final : public CudaKernel {
|
|||
beta_ = info.GetAttrsOrDefault("beta", std::vector<float>(1024, 0.999f));
|
||||
lambda_ = info.GetAttrsOrDefault("lambda", std::vector<float>(1024, 0.0f));
|
||||
epsilon_ = info.GetAttrsOrDefault("epsilon", std::vector<float>(1024, 1e-6f));
|
||||
info.GetAttr("ratio_min", &ratio_min_);
|
||||
info.GetAttr("ratio_max", &ratio_max_);
|
||||
ORT_ENFORCE(info.GetAttr<float>("ratio_min", &ratio_min_).IsOK(), "Missing/Invalid 'ratio_min' attribute value");
|
||||
ORT_ENFORCE(info.GetAttr<float>("ratio_max", &ratio_max_).IsOK(), "Missing/Invalid 'ratio_max' attribute value");
|
||||
}
|
||||
|
||||
Status ComputeInternal(OpKernelContext* context) const override;
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@
|
|||
#include "orttraining/training_ops/cuda/tensor/gather_nd_impl.h"
|
||||
|
||||
#include "core/providers/cuda/cu_inc/common.cuh"
|
||||
#include "core/providers/cuda/atomic/common.cuh"
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
|
@ -53,7 +54,7 @@ __global__ void _GatherNDGradKernel(
|
|||
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(i, num_slices * slice_size);
|
||||
uint64_t slice_offset = slice_offsets[i / slice_size];
|
||||
size_t j = i % slice_size;
|
||||
atomicAdd(output_data + slice_offset + j, update_data[i]);
|
||||
atomic_add(output_data + slice_offset + j, update_data[i]);
|
||||
};
|
||||
|
||||
template <typename TIndex>
|
||||
|
|
@ -121,11 +122,10 @@ SPECIALIZED_COMPUTE_SLICE_OFFSETS_IMPL(int64_t);
|
|||
|
||||
SPECIALIZED_IMPL(float);
|
||||
SPECIALIZED_GRAD_IMPL(float);
|
||||
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
|
||||
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
|
||||
SPECIALIZED_IMPL(half);
|
||||
SPECIALIZED_GRAD_IMPL(half);
|
||||
#endif
|
||||
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
|
||||
|
||||
SPECIALIZED_IMPL(double);
|
||||
SPECIALIZED_GRAD_IMPL(double);
|
||||
#endif
|
||||
|
|
|
|||
Loading…
Reference in a new issue