Reduction ops.

This commit is contained in:
Jesse Benson 2021-01-14 18:17:06 -08:00 committed by Jesse Benson
parent 196132925e
commit a28ddb85b6
3 changed files with 114 additions and 14 deletions

View file

@ -64,32 +64,36 @@ Status ReduceAllL2<TIn, TOut>::ComputeInternal(OpKernelContext* ctx) const {
// alternate path only for deterministic compute ..
typedef AccumulationType_t<HipTOut> HipTAcc;
// find scratch buffer size needed by 'reduce_square_sum' for each tensor
int scratch_size = 0;
// find reduction buffer size needed by 'reduce_square_sum' for each tensor
size_t reduction_buffer_size = 0;
for (int i = 0; i < total_tensor_count; ++i) {
scratch_size = std::max(scratch_size, compute_reduction_buffer_size(sizeof(HipTAcc), tensor_sizes[i]));
reduction_buffer_size =
std::max(reduction_buffer_size, compute_reduction_buffer_size<HipTAcc>(tensor_sizes[i]));
}
// enlarge scratch buffer size for 'reduce_sum' over tensor square norms
scratch_size = std::max(scratch_size, compute_reduction_buffer_size(sizeof(HipTAcc), total_tensor_count));
// add head room for final output and square norms of each tensor
scratch_size += (1 + total_tensor_count) * sizeof(HipTAcc);
// enlarge reduction buffer size for 'reduce_sum' over tensor square norms
reduction_buffer_size =
std::max(reduction_buffer_size, compute_reduction_buffer_size<HipTAcc>(total_tensor_count));
// create GPU scratch space and zero target for each tensor square norm
auto scratch_buffer = GetScratchBuffer<uint8_t>(scratch_size);
HIP_RETURN_IF_ERROR(hipMemsetAsync(scratch_buffer.get(), 0, sizeof(HipTAcc) * (1 + total_tensor_count)));
auto reduction_buffer = GetScratchBuffer<void>(reduction_buffer_size);
HipTAcc* p_global_sqnorm = reinterpret_cast<HipTAcc*>(scratch_buffer.get());
// buffer for final output and square norms of each tensor
auto results_buffer = GetScratchBuffer<HipTAcc>(1 + total_tensor_count);
HIP_RETURN_IF_ERROR(hipMemsetAsync(results_buffer.get(), 0, sizeof(HipTAcc) * (1 + total_tensor_count)));
HipTAcc* p_global_sqnorm = results_buffer.get();
HipTAcc* p_tensor_sqnorm = p_global_sqnorm + 1;
HipTAcc* p_reduce_buffer = p_tensor_sqnorm + total_tensor_count;
// perform reduction l2norm = sqrt[sum(tensor[i][j]**2)] for i,j over all tensor elements
for (int i = 0; i < total_tensor_count; ++i) {
HipTIn* p_tensor_i = reinterpret_cast<HipTIn*>(grouped_tensor_pointers[i][0]);
reduce_square_sum(p_tensor_i, p_tensor_sqnorm + i, tensor_sizes[i], p_reduce_buffer);
ORT_RETURN_IF_ERROR(reduce_square_sum(
p_tensor_i, p_tensor_sqnorm + i, tensor_sizes[i], reduction_buffer.get(), reduction_buffer_size));
}
reduce_sum(p_tensor_sqnorm, p_global_sqnorm, total_tensor_count, p_reduce_buffer);
ORT_RETURN_IF_ERROR(reduce_sum(
p_tensor_sqnorm, p_global_sqnorm, total_tensor_count, reduction_buffer.get(), reduction_buffer_size));
ScalarSqrt(p_global_sqnorm, p_output);
}

View file

@ -63,5 +63,97 @@ Status ReduceKernel<allow_multi_axes>::ComputeImplEx(OpKernelContext* ctx, miope
calculate_log_, calculate_sqt_, log_sum_exp_, fast_reduction);
}
template <>
template <>
Status ReduceKernel<true>::ComputeImplEx<int32_t, MIOPEN_REDUCE_TENSOR_NO_INDICES>(OpKernelContext* ctx, miopenReduceTensorOp_t miopen_reduce_op) const {
typedef typename ToHipType<int32_t>::MappedType HipT;
const Tensor* X = ctx->Input<Tensor>(0);
//override the attribute value with the input value for reduction_axes
const Tensor* axes_tensor = ctx->Input<Tensor>(1);
ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1, "An axes tensor must be a vector tensor.");
auto nDims = static_cast<size_t>(axes_tensor->Shape()[0]);
const auto* data = axes_tensor->template Data<int64_t>();
std::vector<int64_t> axes(data, data + nDims);
// empty axes and no-op
if (axes.empty() && noop_with_empty_axes_) {
auto* Y = ctx->Output(0, X->Shape());
HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->template MutableData<int32_t>(), X->template Data<int32_t>(), X->SizeInBytes(), hipMemcpyDeviceToDevice));
return Status::OK();
}
PrepareReduceMetadata prepare_reduce_metadata;
ORT_RETURN_IF_ERROR(PrepareForReduce(X,
keepdims_,
axes,
prepare_reduce_metadata));
Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims);
int64_t input_count = prepare_reduce_metadata.input_count;
int64_t output_count = prepare_reduce_metadata.output_count;
std::vector<int64_t>& input_dims_miopen = prepare_reduce_metadata.input_dims_miopen;
std::vector<int64_t>& output_dims_miopen = prepare_reduce_metadata.output_dims_miopen;
// special case when there is a dim value of 0 in the shape.
if (input_count == 0) {
assert(Y->Shape().Size() == 0);
return Status::OK();
}
// miopenReduceTensor for ReduceSum has issue if input and output has same size, we just need to copy the data for this case
if (input_count == output_count) {
if (Y->template MutableData<int32_t>() != X->template Data<int32_t>()) {
HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->template MutableData<int32_t>(), X->template Data<int32_t>(), input_count * sizeof(int32_t), hipMemcpyDeviceToDevice));
}
return Status::OK();
}
// This reduction keep adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000.
// Therefore zeroing out the memory is required
HIP_RETURN_IF_ERROR(hipMemsetAsync(Y->MutableDataRaw(), 0, Y->SizeInBytes()));
size_t indices_bytes = 0;
size_t workspace_bytes = 0;
MiopenTensor input_tensor;
MiopenTensor output_tensor;
MiopenReduceDescriptor reduce_desc;
miopenDataType_t miopen_type_X = miopenFloat;
IAllocatorUniquePtr<float> temp_X = GetScratchBuffer<float>(input_count);
Impl_Cast<HipT, float>(reinterpret_cast<const HipT*>(X->template Data<int32_t>()), temp_X.get(), X->Shape().Size());
ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, miopen_type_X, MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES));
ORT_RETURN_IF_ERROR(input_tensor.Set(input_dims_miopen, miopen_type_X));
ORT_RETURN_IF_ERROR(output_tensor.Set(output_dims_miopen, miopen_type_X));
MIOPEN_RETURN_IF_ERROR(miopenGetReductionIndicesSize(MiopenHandle(), reduce_desc, input_tensor, output_tensor, &indices_bytes));
MIOPEN_RETURN_IF_ERROR(miopenGetReductionWorkspaceSize(MiopenHandle(), reduce_desc, input_tensor, output_tensor, &workspace_bytes));
IAllocatorUniquePtr<uint32_t> indices_miopen = GetScratchBuffer<uint32_t>(indices_bytes);
IAllocatorUniquePtr<HipT> workspace_miopen = GetScratchBuffer<HipT>(workspace_bytes);
const auto one = Consts<float>::One;
const auto zero = Consts<float>::Zero;
auto temp_Y = GetScratchBuffer<float>(output_count);
MIOPEN_RETURN_IF_ERROR(miopenReduceTensor(MiopenHandle(),
reduce_desc,
indices_miopen.get(),
indices_bytes,
workspace_miopen.get(),
workspace_bytes,
&one,
input_tensor,
temp_X.get(),
&zero,
output_tensor,
temp_Y.get()));
Impl_Cast<float, int32_t>(temp_Y.get(), Y->template MutableData<int32_t>(), output_count);
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime

View file

@ -32,6 +32,7 @@ ReductionOpTest.ReduceL1_do_not_keep_dims_2
ReductionOpTest.ReduceL1_keepdims
ReductionOpTest.ReduceL1
ReductionOpTest.ReduceL1_int32
ReductionOpTest.ReduceL10DTensor
ReductionOpTest.ReduceL2_default_axes_keepdims
ReductionOpTest.ReduceL2_default_axes_do_not_keep_dims
ReductionOpTest.ReduceL2_do_not_keepdims
@ -39,6 +40,7 @@ ReductionOpTest.ReduceL2_do_not_keepdims_2
ReductionOpTest.ReduceL2_keepdims
ReductionOpTest.ReduceL2
ReductionOpTest.ReduceL2_int32
ReductionOpTest.ReduceL20DTensor
ReductionOpTest.ReduceLogSum
ReductionOpTest.ReduceLogSum_samesize
ReductionOpTest.ReduceLogSum_do_not_keepdims_2
@ -76,6 +78,8 @@ ReductionOpTest.ReduceMean_keepdims_double
ReductionOpTest.ReduceMean
ReductionOpTest.ReduceMean_double
ReductionOpTest.ReduceMean_int32
ReductionOpTest.ReduceMean0DTensor
ReductionOpTest.ReduceMean0DTensor_double
ReductionOpTest.ReduceMin_default_axes_keepdims
ReductionOpTest.ReduceMin_default_axes_do_not_keep_dims
ReductionOpTest.ReduceMin_default_axes_do_not_keep_dims_2D