From c71c49aaa03a07ee4c05ffb6a9af3f150bce3c1d Mon Sep 17 00:00:00 2001 From: edgchen1 <18449977+edgchen1@users.noreply.github.com> Date: Mon, 13 Jul 2020 09:59:56 -0700 Subject: [PATCH] Make TArray safer to use and update method name for consistency. (#4483) - make size_ and data_ data members private - rename GetCapacity() to Capacity() to be consistent (e.g., with Size()) - add static_assert for trivially copyable T because it is copied with memcpy --- .../contrib_ops/cuda/math/complex_mul_impl.cu | 12 +- .../cuda/cu_inc/binary_elementwise_impl.cuh | 2 +- .../providers/cuda/shared_inc/cuda_utils.h | 5 +- .../core/providers/cuda/tensor/expand_impl.cu | 16 +-- .../core/providers/cuda/tensor/resize_impl.cu | 18 +-- .../cuda/tensor/scatter_elements_impl.cu | 2 +- .../core/providers/cuda/tensor/slice.cc | 2 +- .../core/providers/cuda/tensor/slice_impl.cu | 8 +- .../providers/cuda/tensor/transpose_impl.cu | 45 +++---- .../core/providers/cuda/tensor/where.cc | 12 +- .../core/providers/cuda/tensor/where_impl.cu | 120 +++++++++--------- .../training_ops/cuda/math/div_grad_impl.cu | 104 +++++++-------- 12 files changed, 174 insertions(+), 172 deletions(-) diff --git a/onnxruntime/contrib_ops/cuda/math/complex_mul_impl.cu b/onnxruntime/contrib_ops/cuda/math/complex_mul_impl.cu index 5b75df928e..0004cf9433 100644 --- a/onnxruntime/contrib_ops/cuda/math/complex_mul_impl.cu +++ b/onnxruntime/contrib_ops/cuda/math/complex_mul_impl.cu @@ -53,18 +53,18 @@ __global__ void _ElementWiseWithStrideTwo( // compute indexes with broadcasting rules: https://github.com/onnx/onnx/blob/master/docs/Broadcasting.md CUDA_LONG offset = id; #pragma unroll - for (auto dim = 0; dim < fdm_output_strides.GetCapacity(); dim++) { + for (auto dim = 0; dim < fdm_output_strides.Capacity(); dim++) { if (dim >= output_rank) { break; } int q, r; - fdm_output_strides.data_[dim].divmod(offset, q, r); + fdm_output_strides[dim].divmod(offset, q, r); if (lhs_need_compute) { - lhs_index += static_cast(lhs_padded_strides.data_[dim]) * q; + lhs_index += static_cast(lhs_padded_strides[dim]) * q; } if (rhs_need_compute) { - rhs_index += static_cast(rhs_padded_strides.data_[dim]) * q; + rhs_index += static_cast(rhs_padded_strides[dim]) * q; } offset = r; } @@ -109,7 +109,7 @@ void ComplexMul_Impl( int blocksPerGrid = static_cast(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); CUDA_LONG N = static_cast(count); - if (lhs_padded_strides && rhs_padded_strides && lhs_padded_strides->size_ && rhs_padded_strides->size_) + if (lhs_padded_strides && rhs_padded_strides && lhs_padded_strides->Size() && rhs_padded_strides->Size()) _ElementWiseWithStrideTwo<<>>( output_rank_or_simple_broadcast, *lhs_padded_strides, @@ -122,7 +122,7 @@ void ComplexMul_Impl( lhs_size, rhs_size, is_conj); - else if (lhs_padded_strides && lhs_padded_strides->size_) + else if (lhs_padded_strides && lhs_padded_strides->Size()) _ElementWiseWithStrideTwo<<>>( output_rank_or_simple_broadcast, *lhs_padded_strides, diff --git a/onnxruntime/core/providers/cuda/cu_inc/binary_elementwise_impl.cuh b/onnxruntime/core/providers/cuda/cu_inc/binary_elementwise_impl.cuh index ca71d24b31..8fa38813e7 100644 --- a/onnxruntime/core/providers/cuda/cu_inc/binary_elementwise_impl.cuh +++ b/onnxruntime/core/providers/cuda/cu_inc/binary_elementwise_impl.cuh @@ -34,7 +34,7 @@ __global__ void _BinaryElementWise( // compute indexes with broadcasting rules: https://github.com/onnx/onnx/blob/master/docs/Broadcasting.md CUDA_LONG offset = id; #pragma unroll - for (auto dim = 0; dim < fdm_output_strides.GetCapacity(); dim++) { + for (auto dim = 0; dim < fdm_output_strides.Capacity(); dim++) { if (dim >= output_rank) { break; } diff --git a/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h b/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h index 929e03203a..5a5303f45f 100644 --- a/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h +++ b/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h @@ -57,6 +57,7 @@ struct TArray { } TArray(const std::vector& vec) : TArray(static_cast(vec.size())) { + static_assert(std::is_trivially_copyable::value, "T must be trivially copyable."); memcpy(data_, vec.data(), vec.size() * sizeof(T)); } @@ -87,9 +88,9 @@ struct TArray { return data_; } - static constexpr int32_t GetCapacity() { return capacity; }; + static constexpr int32_t Capacity() { return capacity; }; - public: // TODO make these private + private: int32_t size_; T data_[capacity]; }; diff --git a/onnxruntime/core/providers/cuda/tensor/expand_impl.cu b/onnxruntime/core/providers/cuda/tensor/expand_impl.cu index aafb6b701e..79a7dababa 100644 --- a/onnxruntime/core/providers/cuda/tensor/expand_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/expand_impl.cu @@ -63,7 +63,7 @@ __global__ void ExpandKernel( CUDA_LONG index = 0; CUDA_LONG offset = id; #pragma unroll - for (auto dim = 0; dim < output_strides.GetCapacity(); dim++) { + for (auto dim = 0; dim < output_strides.Capacity(); dim++) { if (dim >= rank) { break; } @@ -143,7 +143,7 @@ Status ExpandImpl( void* output_data, const TArray& output_strides, const TArray& input_strides) { - const int rank = static_cast(output_strides.size_); + const int rank = static_cast(output_strides.Size()); if (rank == 1) { if (N_input == N_output) { CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_data, input_data, N_output * element_size, cudaMemcpyDeviceToDevice)); @@ -159,12 +159,12 @@ Status ExpandImpl( int blocksPerGrid = gsl::narrow_cast(CeilDiv(N_output, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); -#define EXPAND_ON(TYPE) \ - case sizeof(TYPE): \ - ExpandKernel \ - <<>>( \ - rank, N_output, reinterpret_cast(input_data), reinterpret_cast(output_data), \ - output_strides, input_strides); \ +#define EXPAND_ON(TYPE) \ + case sizeof(TYPE): \ + ExpandKernel \ + <<>>( \ + rank, N_output, reinterpret_cast(input_data), reinterpret_cast(output_data), \ + output_strides, input_strides); \ break switch (element_size) { diff --git a/onnxruntime/core/providers/cuda/tensor/resize_impl.cu b/onnxruntime/core/providers/cuda/tensor/resize_impl.cu index d87c0a3c3e..5bb720aebf 100644 --- a/onnxruntime/core/providers/cuda/tensor/resize_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/resize_impl.cu @@ -467,7 +467,7 @@ void ResizeNearestImpl( bool could2d = rank >= 2 && transform_coordinate != GetDeviceOriginalCoordinateFunc(ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE) && - std::all_of(scales_vals.data_, scales_vals.data_ + (rank - 2), [](float v) { return v == 1.0; }); + std::all_of(scales_vals.Data(), scales_vals.Data() + (rank - 2), [](float v) { return v == 1.0; }); if (could2d) { int64_t output_height = output_shape[rank - 2]; int64_t output_width = output_shape[rank - 1]; @@ -502,7 +502,7 @@ void ResizeNearestImpl( return; } - int64_t total_dim_sum = std::accumulate(output_shape.data_, output_shape.data_ + rank, 0); + int64_t total_dim_sum = std::accumulate(output_shape.Data(), output_shape.Data() + rank, 0); int blocksPerDimsMappingGrid = (int)(ceil(static_cast(total_dim_sum) / 32)); _ResizeNearestMappingKernel<<>>( rank, input_shape, output_shape, @@ -540,7 +540,7 @@ void ResizeImpl( ResizeCoordinateTransformationMode coordinate_transform_mode, ResizeNearestMode nearest_mode, void* dims_mapping) { - bool isSame = std::all_of(scales_vals.data_, scales_vals.data_ + rank, [](float v) { return v == 1.0f; }) && + bool isSame = std::all_of(scales_vals.Data(), scales_vals.Data() + rank, [](float v) { return v == 1.0f; }) && (coordinate_transform_mode != ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE); if (isSame) { cudaMemcpyAsync(output_data, input_data, N * sizeof(T), cudaMemcpyDeviceToDevice); @@ -606,12 +606,12 @@ void ResizeImpl( template void ResizeImpl( \ const UpsampleMode upsample_mode, \ const int rank, \ - TArray& input_shape, \ - TArray& output_shape, \ - TArray& input_strides, \ - TArray& output_div_pitches, \ - TArray& scales_vals, \ - TArray& roi_vals, \ + TArray& input_shape, \ + TArray& output_shape, \ + TArray& input_strides, \ + TArray& output_div_pitches, \ + TArray& scales_vals, \ + TArray& roi_vals, \ const T* input_data, \ T* output_data, \ const size_t N, \ diff --git a/onnxruntime/core/providers/cuda/tensor/scatter_elements_impl.cu b/onnxruntime/core/providers/cuda/tensor/scatter_elements_impl.cu index 0bf51f741d..da185a257a 100755 --- a/onnxruntime/core/providers/cuda/tensor/scatter_elements_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/scatter_elements_impl.cu @@ -192,7 +192,7 @@ Status ScatterElementsImplInternal( std::vector eff_input_dims; std::vector eff_indices_dims; int new_axis = CompactInputIndicesDims( - rank, axis, buffer_input_dims.data_, buffer_indices_dims.data_, eff_input_dims, eff_indices_dims); + rank, axis, buffer_input_dims.Data(), buffer_indices_dims.Data(), eff_input_dims, eff_indices_dims); if (eff_input_dims.size() == 2) { return ScatterElementsImpl2D( input_data, eff_input_dims, indices_data, indices_size, eff_indices_dims, updates, new_axis, output_data, diff --git a/onnxruntime/core/providers/cuda/tensor/slice.cc b/onnxruntime/core/providers/cuda/tensor/slice.cc index a44d5ee794..a433db34fe 100644 --- a/onnxruntime/core/providers/cuda/tensor/slice.cc +++ b/onnxruntime/core/providers/cuda/tensor/slice.cc @@ -101,7 +101,7 @@ Status Slice::ComputeInternal(OpKernelContext* ctx) const { TArray starts_buffer(starts); TArray steps_buffer(steps); TArray input_strides(gsl::narrow_cast(dimension_count)); - const gsl::span input_strides_span = gsl::make_span(input_strides.data_, input_strides.size_); + const gsl::span input_strides_span = gsl::make_span(input_strides.Data(), input_strides.Size()); if (p_flattened_output_dims != nullptr) { // we were able to flatten the innermost dimensions as they're being copied in full to the output. // do the same flattening to the innermost input dimensions in order to calculate pitches that match diff --git a/onnxruntime/core/providers/cuda/tensor/slice_impl.cu b/onnxruntime/core/providers/cuda/tensor/slice_impl.cu index 0c5b3eb514..df51104194 100644 --- a/onnxruntime/core/providers/cuda/tensor/slice_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/slice_impl.cu @@ -24,16 +24,16 @@ __global__ void _SliceKernel(const int32_t dimension_count, int value = id; int dim = 0; #pragma unroll - for (; dim < starts.GetCapacity(); ++dim) { + for (; dim < starts.Capacity(); ++dim) { if (dim >= dimension_count - 1) { break; } - output_strides.data_[dim].divmod(value, div, mod); - input_index += (starts.data_[dim] + div * steps.data_[dim]) * input_strides.data_[dim]; + output_strides[dim].divmod(value, div, mod); + input_index += (starts[dim] + div * steps[dim]) * input_strides[dim]; value = mod; } - input_index += starts.data_[dim] + mod * steps.data_[dim]; + input_index += starts[dim] + mod * steps[dim]; if (is_grad) output_data[input_index] = input_data[id]; else diff --git a/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu b/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu index 1c776f252f..1a65fb2b6e 100644 --- a/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu @@ -13,7 +13,7 @@ template __global__ void Transpose3DKernel(const TArray input_shape, const TArray input_strides, const T* input_data, T* output_data) { - __shared__ T tile[TILE_DIM * (TILE_DIM+1)]; + __shared__ T tile[TILE_DIM * (TILE_DIM + 1)]; int x = blockIdx.x * TILE_DIM + threadIdx.x; int y = blockIdx.y * TILE_DIM + threadIdx.y; @@ -32,9 +32,9 @@ bool CanDoTranspose3D(int32_t rank, const std::vector& permutations) { if (rank == 3 && // permutation is done in the last two dimensions. - permutations[rank-2] == (rank-1) && permutations[rank-1] == (rank-2) && + permutations[rank - 2] == (rank - 1) && permutations[rank - 1] == (rank - 2) && // the last two dimensions are aligned with TILE_DIM. - input_dims[rank-2] % TILE_DIM == 0 && input_dims[rank-1] % TILE_DIM == 0) { + input_dims[rank - 2] % TILE_DIM == 0 && input_dims[rank - 1] % TILE_DIM == 0) { return true; } return false; @@ -44,7 +44,7 @@ Status Transpose3DImpl(size_t element_size, const TArray& input_shape, const TArray& input_strides, const void* input_data, void* output_data, int64_t N) { dim3 block_size(TILE_DIM, TILE_DIM); - dim3 grid_size(input_shape[2]/TILE_DIM, input_shape[1]/TILE_DIM, input_shape[0]); + dim3 grid_size(input_shape[2] / TILE_DIM, input_shape[1] / TILE_DIM, input_shape[0]); switch (element_size) { case sizeof(int8_t): @@ -73,7 +73,7 @@ Status Transpose3DImpl(size_t element_size, break; default: return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for transpose on CUDA. Element size was ", - element_size); + element_size); } return Status::OK(); @@ -86,13 +86,15 @@ __global__ void Transpose4DKernel(const TArray input_strides, const voi // output coordinates will be: blockIdx.y, blockIdx.x, threadIdx.y, threadIdx.x CUDA_LONG input_index = (blockIdx.y * input_strides[0] + blockIdx.x * input_strides[1] + - threadIdx.y * input_strides[2]) / (4 * sizeof(int) / element_size) + - threadIdx.x * input_strides[3]; + threadIdx.y * input_strides[2]) / + (4 * sizeof(int) / element_size) + + threadIdx.x * input_strides[3]; CUDA_LONG output_index = (blockIdx.y * output_strides[0] + blockIdx.x * output_strides[1] + - threadIdx.y * output_strides[2]) / (4 * sizeof(int) / element_size) + - threadIdx.x * output_strides[3]; + threadIdx.y * output_strides[2]) / + (4 * sizeof(int) / element_size) + + threadIdx.x * output_strides[3]; const int4* v_input = reinterpret_cast(input_data); int4* v_output = reinterpret_cast(output_data); @@ -109,12 +111,11 @@ bool CanDoTranspose4D(const cudaDeviceProp& prop, const std::vector& permutations) { if (rank == 4 && // the permutations is not on the last dimension. - permutations[rank-1] == (rank - 1)) { - + permutations[rank - 1] == (rank - 1)) { // The block size will be set based on the last two dimensions of 4D tensor. // the number threads per block will be calculated as below. - int num_elements_per_thread = 4 * sizeof(int) / element_size; // int4 is used in the kernel to access data. - int64_t num_elements_in_last_two_dimensions = input_dims[rank-2] * input_dims[rank-1]; + int num_elements_per_thread = 4 * sizeof(int) / element_size; // int4 is used in the kernel to access data. + int64_t num_elements_in_last_two_dimensions = input_dims[rank - 2] * input_dims[rank - 1]; int64_t num_threads_per_block = num_elements_in_last_two_dimensions / num_elements_per_thread; if (((num_elements_in_last_two_dimensions & (num_elements_per_thread - 1)) == 0) && @@ -130,34 +131,34 @@ bool CanDoTranspose4D(const cudaDeviceProp& prop, Status Transpose4DImpl(size_t element_size, const TArray& input_shape, const TArray& input_strides, const void* input_data, const TArray& output_strides, void* output_data, int64_t N) { - int num_elements_per_thread = 4 * sizeof(int) / element_size; // int4 is used in the kernel to access data. - dim3 block_size(input_shape[3]/num_elements_per_thread, input_shape[2]); + int num_elements_per_thread = 4 * sizeof(int) / element_size; // int4 is used in the kernel to access data. + dim3 block_size(input_shape[3] / num_elements_per_thread, input_shape[2]); dim3 grid_size(input_shape[1], input_shape[0]); switch (element_size) { case sizeof(int8_t): Transpose4DKernel<<>>( input_strides, input_data, - output_strides, output_data, N/num_elements_per_thread); + output_strides, output_data, N / num_elements_per_thread); break; case sizeof(int16_t): Transpose4DKernel<<>>( input_strides, input_data, - output_strides, output_data, N/num_elements_per_thread); + output_strides, output_data, N / num_elements_per_thread); break; case sizeof(int32_t): Transpose4DKernel<<>>( input_strides, input_data, - output_strides, output_data, N/num_elements_per_thread); + output_strides, output_data, N / num_elements_per_thread); break; case sizeof(int64_t): Transpose4DKernel<<>>( input_strides, input_data, - output_strides, output_data, N/num_elements_per_thread); + output_strides, output_data, N / num_elements_per_thread); break; default: return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for transpose on CUDA. Element size was ", - element_size); + element_size); } return Status::OK(); @@ -170,8 +171,8 @@ __global__ void TransposeKernel(int32_t shape_rank, const TArray input_ CUDA_LONG input_index = 0; CUDA_LONG output_index = id; - #pragma unroll - for (auto dim = 0; dim < input_strides.GetCapacity(); ++dim) { +#pragma unroll + for (auto dim = 0; dim < input_strides.Capacity(); ++dim) { if (dim >= shape_rank) { break; } diff --git a/onnxruntime/core/providers/cuda/tensor/where.cc b/onnxruntime/core/providers/cuda/tensor/where.cc index 4be5578b28..2b765789f8 100644 --- a/onnxruntime/core/providers/cuda/tensor/where.cc +++ b/onnxruntime/core/providers/cuda/tensor/where.cc @@ -68,10 +68,10 @@ struct TernaryElementwisePreparation { const Tensor* a_tensor = nullptr; const Tensor* b_tensor = nullptr; const Tensor* c_tensor = nullptr; - size_t output_rank_or_simple_broadcast = 0; // for no_broadcast cases, output_rank uses SimpleBroadcast enums - TArray a_padded_strides; // for a shape == output shape, this is nullptr - TArray b_padded_strides; // for b shape == output shape, this is nullptr - TArray c_padded_strides; // for c shape == output shape, this is nullptr + size_t output_rank_or_simple_broadcast = 0; // for no_broadcast cases, output_rank uses SimpleBroadcast enums + TArray a_padded_strides; // for a shape == output shape, this is nullptr + TArray b_padded_strides; // for b shape == output shape, this is nullptr + TArray c_padded_strides; // for c shape == output shape, this is nullptr TArray fdm_output_strides; BroadcastIndexType a_index_type = BroadcastIndexType::NoBroadcast; BroadcastIndexType b_index_type = BroadcastIndexType::NoBroadcast; @@ -98,7 +98,7 @@ struct TernaryElementwisePreparation { output_rank_or_simple_broadcast = out_rank; auto padder = [out_rank](int32_t rank, const TensorShape& shape, TArray& padded_strides) { - padded_strides.size_ = out_rank; + padded_strides.SetSize(out_rank); if (rank > 0) { TensorPitches pitches(shape.GetDims()); auto offset = out_rank - rank; @@ -142,7 +142,7 @@ struct TernaryElementwisePreparation { } TensorPitches output_pitches(output_shape.GetDims()); - fdm_output_strides.size_ = out_rank; + fdm_output_strides.SetSize(out_rank); for (auto i = 0; i < out_rank; ++i) { fdm_output_strides[i] = fast_divmod(static_cast(output_pitches[i])); } diff --git a/onnxruntime/core/providers/cuda/tensor/where_impl.cu b/onnxruntime/core/providers/cuda/tensor/where_impl.cu index c4c4392007..319007c359 100644 --- a/onnxruntime/core/providers/cuda/tensor/where_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/where_impl.cu @@ -37,7 +37,7 @@ __global__ void _TenaryElementWise( CUDA_LONG y_index = (YIndexType == BroadcastIndexType::NoBroadcast ? id : 0); CUDA_LONG offset = id; #pragma unroll - for (auto dim = 0; dim < fdm_output_strides.GetCapacity(); dim++) { + for (auto dim = 0; dim < fdm_output_strides.Capacity(); dim++) { if (dim >= output_rank) { break; } @@ -111,73 +111,73 @@ __global__ void _TenaryElementWiseSimple( } } -#define HANDLE_Y_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, X_INDEX_TYPE, Y_INDEX_TYPE) \ - case Y_INDEX_TYPE: { \ - _TenaryElementWiseSimple \ - <<>>(cond_data, \ - x_data, \ - y_data, \ - output_data, \ - N); \ +#define HANDLE_Y_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, X_INDEX_TYPE, Y_INDEX_TYPE) \ + case Y_INDEX_TYPE: { \ + _TenaryElementWiseSimple \ + <<>>(cond_data, \ + x_data, \ + y_data, \ + output_data, \ + N); \ } break -#define HANDLE_X_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, X_INDEX_TYPE, Y_INDEX_TYPE_VAL) \ - case X_INDEX_TYPE: { \ - switch(Y_INDEX_TYPE_VAL) { \ - HANDLE_Y_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, X_INDEX_TYPE, BroadcastIndexType::NoBroadcast); \ - HANDLE_Y_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, X_INDEX_TYPE, BroadcastIndexType::Scalar); \ - } \ +#define HANDLE_X_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, X_INDEX_TYPE, Y_INDEX_TYPE_VAL) \ + case X_INDEX_TYPE: { \ + switch (Y_INDEX_TYPE_VAL) { \ + HANDLE_Y_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, X_INDEX_TYPE, BroadcastIndexType::NoBroadcast); \ + HANDLE_Y_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, X_INDEX_TYPE, BroadcastIndexType::Scalar); \ + } \ } break -#define HANDLE_COND_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, X_INDEX_TYPE_VAL, Y_INDEX_TYPE_VAL) \ - case COND_INDEX_TYPE: { \ - switch(X_INDEX_TYPE_VAL) { \ - HANDLE_X_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, BroadcastIndexType::NoBroadcast, Y_INDEX_TYPE_VAL); \ - HANDLE_X_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, BroadcastIndexType::Scalar, Y_INDEX_TYPE_VAL); \ - } \ +#define HANDLE_COND_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, X_INDEX_TYPE_VAL, Y_INDEX_TYPE_VAL) \ + case COND_INDEX_TYPE: { \ + switch (X_INDEX_TYPE_VAL) { \ + HANDLE_X_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, BroadcastIndexType::NoBroadcast, Y_INDEX_TYPE_VAL); \ + HANDLE_X_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, BroadcastIndexType::Scalar, Y_INDEX_TYPE_VAL); \ + } \ } break -#define HANDLE_Y_INDEX_TYPE(COND_INDEX_TYPE, X_INDEX_TYPE, Y_INDEX_TYPE) \ - case Y_INDEX_TYPE: { \ - _TenaryElementWise \ - <<>>(output_rank_or_simple_broadcast, \ - cond_padded_strides, \ - cond_data, \ - x_padded_strides, \ - x_data, \ - y_padded_strides, \ - y_data, \ - fdm_output_strides, \ - output_data, \ - N); \ +#define HANDLE_Y_INDEX_TYPE(COND_INDEX_TYPE, X_INDEX_TYPE, Y_INDEX_TYPE) \ + case Y_INDEX_TYPE: { \ + _TenaryElementWise \ + <<>>(output_rank_or_simple_broadcast, \ + cond_padded_strides, \ + cond_data, \ + x_padded_strides, \ + x_data, \ + y_padded_strides, \ + y_data, \ + fdm_output_strides, \ + output_data, \ + N); \ } break -#define HANDLE_X_INDEX_TYPE(COND_INDEX_TYPE, X_INDEX_TYPE, Y_INDEX_TYPE_VAL) \ - case X_INDEX_TYPE: { \ - switch(Y_INDEX_TYPE_VAL) { \ - HANDLE_Y_INDEX_TYPE(COND_INDEX_TYPE, X_INDEX_TYPE, BroadcastIndexType::NoBroadcast); \ - HANDLE_Y_INDEX_TYPE(COND_INDEX_TYPE, X_INDEX_TYPE, BroadcastIndexType::Scalar); \ - HANDLE_Y_INDEX_TYPE(COND_INDEX_TYPE, X_INDEX_TYPE, BroadcastIndexType::NeedCompute); \ - } \ +#define HANDLE_X_INDEX_TYPE(COND_INDEX_TYPE, X_INDEX_TYPE, Y_INDEX_TYPE_VAL) \ + case X_INDEX_TYPE: { \ + switch (Y_INDEX_TYPE_VAL) { \ + HANDLE_Y_INDEX_TYPE(COND_INDEX_TYPE, X_INDEX_TYPE, BroadcastIndexType::NoBroadcast); \ + HANDLE_Y_INDEX_TYPE(COND_INDEX_TYPE, X_INDEX_TYPE, BroadcastIndexType::Scalar); \ + HANDLE_Y_INDEX_TYPE(COND_INDEX_TYPE, X_INDEX_TYPE, BroadcastIndexType::NeedCompute); \ + } \ } break -#define HANDLE_COND_INDEX_TYPE(COND_INDEX_TYPE, X_INDEX_TYPE_VAL, Y_INDEX_TYPE_VAL) \ - case COND_INDEX_TYPE: { \ - switch(X_INDEX_TYPE_VAL) { \ - HANDLE_X_INDEX_TYPE(COND_INDEX_TYPE, BroadcastIndexType::NoBroadcast, Y_INDEX_TYPE_VAL); \ - HANDLE_X_INDEX_TYPE(COND_INDEX_TYPE, BroadcastIndexType::Scalar, Y_INDEX_TYPE_VAL); \ - HANDLE_X_INDEX_TYPE(COND_INDEX_TYPE, BroadcastIndexType::NeedCompute, Y_INDEX_TYPE_VAL); \ - } \ +#define HANDLE_COND_INDEX_TYPE(COND_INDEX_TYPE, X_INDEX_TYPE_VAL, Y_INDEX_TYPE_VAL) \ + case COND_INDEX_TYPE: { \ + switch (X_INDEX_TYPE_VAL) { \ + HANDLE_X_INDEX_TYPE(COND_INDEX_TYPE, BroadcastIndexType::NoBroadcast, Y_INDEX_TYPE_VAL); \ + HANDLE_X_INDEX_TYPE(COND_INDEX_TYPE, BroadcastIndexType::Scalar, Y_INDEX_TYPE_VAL); \ + HANDLE_X_INDEX_TYPE(COND_INDEX_TYPE, BroadcastIndexType::NeedCompute, Y_INDEX_TYPE_VAL); \ + } \ } break template @@ -198,12 +198,12 @@ void WhereImpl( int blocksPerGrid = static_cast(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); CUDA_LONG N = static_cast(count); if (output_rank_or_simple_broadcast == static_cast(SimpleBroadcast::NoBroadcast)) { - switch(cond_index_type) { + switch (cond_index_type) { HANDLE_COND_INDEX_TYPE_SIMPLE(BroadcastIndexType::NoBroadcast, x_index_type, y_index_type); HANDLE_COND_INDEX_TYPE_SIMPLE(BroadcastIndexType::Scalar, x_index_type, y_index_type); } } else { - switch(cond_index_type) { + switch (cond_index_type) { HANDLE_COND_INDEX_TYPE(BroadcastIndexType::NoBroadcast, x_index_type, y_index_type); HANDLE_COND_INDEX_TYPE(BroadcastIndexType::Scalar, x_index_type, y_index_type); HANDLE_COND_INDEX_TYPE(BroadcastIndexType::NeedCompute, x_index_type, y_index_type); diff --git a/orttraining/orttraining/training_ops/cuda/math/div_grad_impl.cu b/orttraining/orttraining/training_ops/cuda/math/div_grad_impl.cu index f9746a8e64..527f396093 100644 --- a/orttraining/orttraining/training_ops/cuda/math/div_grad_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/math/div_grad_impl.cu @@ -177,19 +177,19 @@ __global__ void _DivGrad( CUDA_LONG a_index = (a_need_compute ? 0 : id); CUDA_LONG b_index = (b_need_compute ? 0 : id); CUDA_LONG offset = id; - #pragma unroll - for (auto dim = 0; dim < fdm_output_strides.GetCapacity(); dim++) { +#pragma unroll + for (auto dim = 0; dim < fdm_output_strides.Capacity(); dim++) { if (dim >= output_rank) { break; } int q, r; - fdm_output_strides.data_[dim].divmod(offset, q, r); + fdm_output_strides[dim].divmod(offset, q, r); if (a_need_compute) { - a_index += static_cast(a_padded_strides.data_[dim]) * q; + a_index += static_cast(a_padded_strides[dim]) * q; } if (b_need_compute) { - b_index += static_cast(b_padded_strides.data_[dim]) * q; + b_index += static_cast(b_padded_strides[dim]) * q; } offset = r; } @@ -209,15 +209,15 @@ __global__ void _DivGrad_A( CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); CUDA_LONG b_index = (b_need_compute ? 0 : id); CUDA_LONG offset = id; - #pragma unroll - for (auto dim = 0; dim < fdm_output_strides.GetCapacity(); dim++) { +#pragma unroll + for (auto dim = 0; dim < fdm_output_strides.Capacity(); dim++) { if (dim >= output_rank) { break; } int q, r; - fdm_output_strides.data_[dim].divmod(offset, q, r); + fdm_output_strides[dim].divmod(offset, q, r); if (b_need_compute) { - b_index += static_cast(b_padded_strides.data_[dim]) * q; + b_index += static_cast(b_padded_strides[dim]) * q; } offset = r; } @@ -239,19 +239,19 @@ __global__ void _DivGrad_B( CUDA_LONG a_index = (a_need_compute ? 0 : id); CUDA_LONG b_index = (b_need_compute ? 0 : id); CUDA_LONG offset = id; - #pragma unroll - for (auto dim = 0; dim < fdm_output_strides.GetCapacity(); dim++) { +#pragma unroll + for (auto dim = 0; dim < fdm_output_strides.Capacity(); dim++) { if (dim >= output_rank) { break; } int q, r; - fdm_output_strides.data_[dim].divmod(offset, q, r); + fdm_output_strides[dim].divmod(offset, q, r); if (a_need_compute) { - a_index += static_cast(a_padded_strides.data_[dim]) * q; + a_index += static_cast(a_padded_strides[dim]) * q; } if (b_need_compute) { - b_index += static_cast(b_padded_strides.data_[dim]) * q; + b_index += static_cast(b_padded_strides[dim]) * q; } offset = r; } @@ -441,7 +441,7 @@ void ImplDivGrad( T* db_output_data) { int blocksPerGrid = (int)(ceil(static_cast(count) / GridDim::maxThreadsPerBlock)); CUDA_LONG N = static_cast(count); - if (a_padded_strides && a_padded_strides->size_ && b_padded_strides && b_padded_strides->size_) { + if (a_padded_strides && a_padded_strides->Size() && b_padded_strides && b_padded_strides->Size()) { if (da_output_data && db_output_data) _DivGrad<<>>( output_rank, @@ -474,7 +474,7 @@ void ImplDivGrad( *fdm_output_strides, db_output_data, N); - } else if (a_padded_strides && a_padded_strides->size_) { + } else if (a_padded_strides && a_padded_strides->Size()) { if (da_output_data && db_output_data) _DivGrad<<>>( output_rank, @@ -543,42 +543,42 @@ void ImplDivGrad( } } // namespace cuda -#define SPECIALIZED_DIV_GRAD_IMPL(T) \ - template void ImplDivGrad( \ - int32_t output_rank, \ - const TArray* a_padded_strides, \ - const T* a_data, \ - const TArray* b_padded_strides, \ - const T* b_data, \ - const T* dy_data, \ - size_t count, \ - const TArray* fdm_output_strides,\ - T* da_output_data, \ - T* db_output_data); \ - template void ImplDivGradRhsPerChannelBatch1( \ - const T* a_data, \ - const T* b_data, \ - const T* dy_data, \ - size_t count, \ - const fast_divmod& fdm_H, \ - T* da_output_data, \ - T* db_output_data); \ - template void ImplDivGradRhsPerChannelBatchN( \ - const T* a_data, \ - const T* b_data, \ - const T* dy_data, \ - size_t count, \ - const fast_divmod& fdm_H, \ - const fast_divmod& fdm_C, \ - T* da_output_data, \ - T* db_output_data); \ - template void ImplDivGradSimple( \ - SimpleBroadcast simpleBroadcast, \ - const T* a_data, \ - const T* b_data, \ - const T* dy_data, \ - size_t count, \ - T* da_output_data, \ +#define SPECIALIZED_DIV_GRAD_IMPL(T) \ + template void ImplDivGrad( \ + int32_t output_rank, \ + const TArray* a_padded_strides, \ + const T* a_data, \ + const TArray* b_padded_strides, \ + const T* b_data, \ + const T* dy_data, \ + size_t count, \ + const TArray* fdm_output_strides, \ + T* da_output_data, \ + T* db_output_data); \ + template void ImplDivGradRhsPerChannelBatch1( \ + const T* a_data, \ + const T* b_data, \ + const T* dy_data, \ + size_t count, \ + const fast_divmod& fdm_H, \ + T* da_output_data, \ + T* db_output_data); \ + template void ImplDivGradRhsPerChannelBatchN( \ + const T* a_data, \ + const T* b_data, \ + const T* dy_data, \ + size_t count, \ + const fast_divmod& fdm_H, \ + const fast_divmod& fdm_C, \ + T* da_output_data, \ + T* db_output_data); \ + template void ImplDivGradSimple( \ + SimpleBroadcast simpleBroadcast, \ + const T* a_data, \ + const T* b_data, \ + const T* dy_data, \ + size_t count, \ + T* da_output_data, \ T* db_output_data); SPECIALIZED_DIV_GRAD_IMPL(half)