diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 7d2a7f2bea..d7e5e2b9e3 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -1198,9 +1198,9 @@ if (onnxruntime_USE_CUDA) endif() endif() endif() - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --default-stream legacy") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") if (NOT WIN32) - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --expt-relaxed-constexpr --compiler-options -fPIC") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --compiler-options -fPIC") endif() # Options passed to cudafe set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcudafe \"--diag_suppress=bad_friend_decl\"") diff --git a/include/onnxruntime/core/framework/execution_provider.h b/include/onnxruntime/core/framework/execution_provider.h index 6069ed4839..a2454997bc 100644 --- a/include/onnxruntime/core/framework/execution_provider.h +++ b/include/onnxruntime/core/framework/execution_provider.h @@ -165,6 +165,9 @@ class IExecutionProvider { */ virtual common::Status OnSessionInitializationEnd() { return Status::OK(); } + virtual common::Status SetComputeStream(void*) { return Status::OK(); } + virtual void* GetComputeStream() const { return nullptr; } + void InsertAllocator(AllocatorPtr allocator); void ReplaceAllocator(AllocatorPtr allocator); // TODO: temparary sulotion, need to unify the interface in EP and AllocatorManager diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index 3b5c4af359..b0985608fc 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -266,8 +266,19 @@ typedef struct OrtCUDAProviderOptions { size_t cuda_mem_limit; // default cuda memory limitation to maximum finite value of size_t. int arena_extend_strategy; // default area extend strategy to KNextPowerOfTwo. int do_copy_in_default_stream; + int has_user_compute_stream; + void* user_compute_stream; } OrtCUDAProviderOptions; +/// +/// Options for the TensorRT provider that are passed to SessionOptionsAppendExecutionProvider_TensorRT +/// +typedef struct OrtTensorRTProviderOptions { + int device_id; + int has_user_compute_stream; + void* user_compute_stream; +} OrtTensorRTProviderOptions; + /// /// Options for the OpenVINO provider that are passed to SessionOptionsAppendExecutionProvider_OpenVINO /// @@ -1146,6 +1157,12 @@ struct OrtApi { */ ORT_API2_STATUS(ModelMetadataGetGraphDescription, _In_ const OrtModelMetadata* model_metadata, _Inout_ OrtAllocator* allocator, _Outptr_ char** value); + /** + * Append TensorRT execution provider to the session options + * If TensorRT is not available (due to a non TensorRT enabled build), this function will return failure. 
+ */ + ORT_API2_STATUS(SessionOptionsAppendExecutionProvider_TensorRT, + _In_ OrtSessionOptions* options, _In_ const OrtTensorRTProviderOptions* tensorrt_options); }; /* diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h index d5aa79a79d..be43d9cd21 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h @@ -326,6 +326,7 @@ struct SessionOptions : Base { SessionOptions& AppendExecutionProvider_CUDA(const OrtCUDAProviderOptions& provider_options); SessionOptions& AppendExecutionProvider_OpenVINO(const OrtOpenVINOProviderOptions& provider_options); + SessionOptions& AppendExecutionProvider_TensorRT(const OrtTensorRTProviderOptions& provider_options); }; struct ModelMetadata : Base { diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h index a5ce8219f6..a818c3c691 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h @@ -490,6 +490,11 @@ inline SessionOptions& SessionOptions::AppendExecutionProvider_CUDA(const OrtCUD return *this; } +inline SessionOptions& SessionOptions::AppendExecutionProvider_TensorRT(const OrtTensorRTProviderOptions& provider_options) { + ThrowOnError(GetApi().SessionOptionsAppendExecutionProvider_TensorRT(p_, &provider_options)); + return *this; +} + inline SessionOptions& SessionOptions::AppendExecutionProvider_OpenVINO(const OrtOpenVINOProviderOptions& provider_options) { ThrowOnError(GetApi().SessionOptionsAppendExecutionProvider_OpenVINO(p_, &provider_options)); return *this; diff --git a/onnxruntime/contrib_ops/cuda/activation/activations.cc b/onnxruntime/contrib_ops/cuda/activation/activations.cc index 45bda90b1e..6a26e0f6c3 100644 --- a/onnxruntime/contrib_ops/cuda/activation/activations.cc +++ b/onnxruntime/contrib_ops/cuda/activation/activations.cc @@ -29,6 +29,7 @@ namespace cuda { ORT_RETURN_IF_ERROR(UnaryElementwise::Prepare(context, &p)); \ Ctx##x func_ctx = MakeFuncCtx(); \ Impl_##x::MappedType>( \ + Stream(), \ reinterpret_cast::MappedType*>(p.input_tensor->template Data()), \ reinterpret_cast::MappedType*>(p.output_tensor->template MutableData()), \ &func_ctx, p.output_tensor->Shape().Size()); \ diff --git a/onnxruntime/contrib_ops/cuda/activation/activations_impl.cu b/onnxruntime/contrib_ops/cuda/activation/activations_impl.cu index 62601a1c69..7988ecd42f 100644 --- a/onnxruntime/contrib_ops/cuda/activation/activations_impl.cu +++ b/onnxruntime/contrib_ops/cuda/activation/activations_impl.cu @@ -45,14 +45,15 @@ struct OP_Gelu : public CtxGelu { #define UNARY_ACTIVATION_IMPL(name) \ UNARY_ACTIVATION_IMPL_DECLARATION(name) { \ - UnaryElementWiseImpl(input_data, \ + UnaryElementWiseImpl(stream, \ + input_data, \ output_data, \ *reinterpret_cast*>(func_ctx), \ count); \ } #define SPECIALIZED_UNARY_ACTIVATION_IMPL(name, T) \ - template void Impl_##name(const T* input_data, T* output_data, const Ctx##name* func_ctx, size_t count); + template void Impl_##name(cudaStream_t stream, const T* input_data, T* output_data, const Ctx##name* func_ctx, size_t count); #define SPECIALIZED_UNARY_ACTIVATIONL_HFD(name) \ SPECIALIZED_UNARY_ACTIVATION_IMPL(name, half) \ diff --git a/onnxruntime/contrib_ops/cuda/activation/activations_impl.h b/onnxruntime/contrib_ops/cuda/activation/activations_impl.h index 95ea6d5af6..56ece01e46 100644 --- 
a/onnxruntime/contrib_ops/cuda/activation/activations_impl.h +++ b/onnxruntime/contrib_ops/cuda/activation/activations_impl.h @@ -22,6 +22,7 @@ typedef onnxruntime::cuda::CtxNull CtxGelu; #define UNARY_ACTIVATION_IMPL_DECLARATION(name) \ template \ void Impl_##name( \ + cudaStream_t stream, \ const T* input_data, \ T* output_data, \ const Ctx##name* func_ctx, \ diff --git a/onnxruntime/contrib_ops/cuda/bert/attention.cc b/onnxruntime/contrib_ops/cuda/bert/attention.cc index 25a23a5111..ce9147ad1b 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/attention.cc @@ -88,6 +88,7 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { auto temp_buffer = GetScratchBuffer(workSpaceSize); if (!LaunchAttentionKernel( device_prop, + Stream(), reinterpret_cast(gemm_buffer.get()), nullptr == mask_index ? nullptr : mask_index->template Data(), nullptr == mask_index ? nullptr : &(mask_index->Shape().GetDims()), diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu index 00f92b4f1c..a342168c6d 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu @@ -148,6 +148,7 @@ bool QkvToContext( bool LaunchAttentionKernel( const cudaDeviceProp& prop, + cudaStream_t stream, const void* input, const int* mask_index, const std::vector* mask_index_dims, @@ -163,9 +164,6 @@ bool LaunchAttentionKernel( int past_sequence_length, const void* past, void* present) { - // use default stream - const cudaStream_t stream = nullptr; - if (element_size == 2) { return QkvToContext(prop, cublas, stream, batch_size, sequence_length, num_heads, head_size, element_size, diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_impl.h b/onnxruntime/contrib_ops/cuda/bert/attention_impl.h index c51c007290..30f03b8668 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention_impl.h +++ b/onnxruntime/contrib_ops/cuda/bert/attention_impl.h @@ -20,6 +20,7 @@ size_t GetAttentionWorkspaceSize( bool LaunchAttentionKernel( const cudaDeviceProp& prop, // Device Properties + cudaStream_t stream, // cuda stream const void* input, // Input tensor const int* mask_index, // Attention mask raw data or index (end position of each sequence, or end positions and start positions). NULL means no mask. 
const std::vector* mask_index_dims, // Mask index shape diff --git a/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm.cc b/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm.cc index 8adffa85ed..e975181d29 100644 --- a/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm.cc +++ b/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm.cc @@ -61,6 +61,7 @@ Status EmbedLayerNorm::ComputeInternal(OpKernelContext* context) const { size_t element_size = sizeof(T); if (!LaunchEmbedLayerNormKernel( + Stream(), output->template MutableData(), mask_index->template MutableData(), input_ids->template Data(), diff --git a/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm_impl.cu b/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm_impl.cu index 9e856e2e35..ad005e40e0 100644 --- a/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm_impl.cu @@ -173,6 +173,7 @@ bool EmbedSkipLayerNorm( } bool LaunchEmbedLayerNormKernel( + cudaStream_t stream, void* output, void* mask_index, const int* input_ids, @@ -188,10 +189,8 @@ bool LaunchEmbedLayerNormKernel( int batch_size, int sequence_length, const size_t element_size) { - const cudaStream_t stream = nullptr; // default stream - if (nullptr == input_mask) { - if (!CUDA_CALL(cudaMemsetAsync(mask_index, 0, sizeof(int) * batch_size))) + if (!CUDA_CALL(cudaMemsetAsync(mask_index, 0, sizeof(int) * batch_size, stream))) return false; } else if (!ComputeMaskIndex(stream, sequence_length, batch_size, input_mask, static_cast(mask_index))) { return false; diff --git a/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm_impl.h b/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm_impl.h index 18648e6799..6977fd3e8e 100644 --- a/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm_impl.h +++ b/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm_impl.h @@ -6,7 +6,8 @@ namespace onnxruntime { namespace contrib { namespace cuda { -bool LaunchEmbedLayerNormKernel(void* output, // output tensor +bool LaunchEmbedLayerNormKernel(cudaStream_t stream, + void* output, // output tensor void* mask_index, // output mask index const int* input_ids, // input word IDs const int* segment_ids, // input segment IDs diff --git a/onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc b/onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc index 642ef3458c..8e4bfb1c84 100644 --- a/onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc +++ b/onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc @@ -47,7 +47,7 @@ Status FastGelu::ComputeInternal(OpKernelContext* context) const { int64_t bias_length = (nullptr == bias) ? 
0 : bias->Shape().Size(); typedef typename ToCudaType::MappedType CudaT; if (!LaunchFastGeluKernel(GetDeviceProp(), - nullptr, + Stream(), static_cast(input_length), static_cast(bias_length), reinterpret_cast(input->template Data()), diff --git a/onnxruntime/contrib_ops/cuda/bert/longformer_attention.cc b/onnxruntime/contrib_ops/cuda/bert/longformer_attention.cc index ef2eecb1ec..9ec5298c2b 100644 --- a/onnxruntime/contrib_ops/cuda/bert/longformer_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/longformer_attention.cc @@ -111,6 +111,7 @@ Status LongformerAttention::ComputeInternal(OpKernelContext* context) const { auto workspace_buffer = GetScratchBuffer(workSpaceSize); if (!LaunchLongformerAttentionKernel( device_prop, + Stream(), reinterpret_cast(gemm_buffer.get()), reinterpret_cast(mask->template Data()), reinterpret_cast(global_gemm_buffer.get()), diff --git a/onnxruntime/contrib_ops/cuda/bert/longformer_attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/longformer_attention_impl.cu index fd9637dfc9..191a979fc9 100644 --- a/onnxruntime/contrib_ops/cuda/bert/longformer_attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/longformer_attention_impl.cu @@ -814,6 +814,7 @@ bool LongformerQkvToContext( bool LaunchLongformerAttentionKernel( const cudaDeviceProp& prop, + cudaStream_t stream, const void* input, const void* attention_mask, const void* global_input, @@ -828,9 +829,6 @@ bool LaunchLongformerAttentionKernel( void* workspace, cublasHandle_t& cublas, const size_t element_size) { - // use default stream - const cudaStream_t stream = nullptr; - if (element_size == 2) { return LongformerQkvToContext(prop, cublas, stream, batch_size, sequence_length, num_heads, head_size, window, element_size, diff --git a/onnxruntime/contrib_ops/cuda/bert/longformer_attention_impl.h b/onnxruntime/contrib_ops/cuda/bert/longformer_attention_impl.h index 632f6d6e5c..c08461e800 100644 --- a/onnxruntime/contrib_ops/cuda/bert/longformer_attention_impl.h +++ b/onnxruntime/contrib_ops/cuda/bert/longformer_attention_impl.h @@ -18,6 +18,7 @@ size_t GetLongformerAttentionWorkspaceSize( bool LaunchLongformerAttentionKernel( const cudaDeviceProp& device_prop, // Device Properties + cudaStream_t stream, // CUDA stream const void* input, // Input tensor const void* attention_mask, // Attention mask with shape (B, S) const void* global_input, // Global attention input, or nullptr when max_num_global == 0. 
diff --git a/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm.cc b/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm.cc index f8f6c2ad49..b8238f7690 100644 --- a/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm.cc @@ -93,6 +93,7 @@ Status SkipLayerNorm::ComputeInternal(OpKernelContext* ctx) const { size_t element_size = sizeof(T); if (!LaunchSkipLayerNormKernel( + Stream(), output->template MutableData(), input->template Data(), skip->template Data(), diff --git a/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm_impl.cu b/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm_impl.cu index 9c11ff85e0..a7b6aabe52 100644 --- a/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm_impl.cu @@ -100,6 +100,7 @@ bool ComputeSkipLayerNorm( } bool LaunchSkipLayerNormKernel( + cudaStream_t stream, void* output, const void* input, const void* skip, @@ -110,9 +111,6 @@ bool LaunchSkipLayerNormKernel( int hidden_size, int element_count, size_t element_size) { - // use default stream - const cudaStream_t stream = nullptr; - if (element_size == 2) { return ComputeSkipLayerNorm( stream, diff --git a/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm_impl.h b/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm_impl.h index 308242c010..0148231f2b 100644 --- a/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm_impl.h +++ b/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm_impl.h @@ -8,6 +8,7 @@ namespace contrib { namespace cuda { bool LaunchSkipLayerNormKernel( + cudaStream_t stream, void* output, // output tensor const void* input, // input tensor const void* skip, // skip tensor diff --git a/onnxruntime/contrib_ops/cuda/fused_conv.cc b/onnxruntime/contrib_ops/cuda/fused_conv.cc index 0e24032c48..6cce365871 100644 --- a/onnxruntime/contrib_ops/cuda/fused_conv.cc +++ b/onnxruntime/contrib_ops/cuda/fused_conv.cc @@ -90,7 +90,7 @@ class FusedConv : public onnxruntime::cuda::Conv { Base::s_.y_data, beta, Base::s_.y_tensor, Base::s_.y_data)); } if (Base::s_.post_slicing_required) { - onnxruntime::cuda::SliceOutUnwantedOutputSection(Base::s_.y_data, Base::s_.y_dims_with_adjusted_pads, Base::s_.Y->MutableDataRaw(), + onnxruntime::cuda::SliceOutUnwantedOutputSection(this->Stream(), Base::s_.y_data, Base::s_.y_dims_with_adjusted_pads, Base::s_.Y->MutableDataRaw(), Base::s_.y_dims, Base::s_.slice_starts, Base::s_.slice_ends, Base::s_.slice_axes, Base::s_.element_size); } return Status::OK(); diff --git a/onnxruntime/contrib_ops/cuda/inverse.cc b/onnxruntime/contrib_ops/cuda/inverse.cc index 546fc105de..f762b09d9f 100644 --- a/onnxruntime/contrib_ops/cuda/inverse.cc +++ b/onnxruntime/contrib_ops/cuda/inverse.cc @@ -35,22 +35,24 @@ ONNX_OPERATOR_KERNEL_EX( namespace inverse_internal { template -Status ComputeMatrixOffsets(T* workspace_data, size_t num_batches, size_t rows, IAllocatorUniquePtr& matrix_ptrs) { +Status ComputeMatrixOffsets(cudaStream_t stream, T* workspace_data, size_t num_batches, size_t rows, IAllocatorUniquePtr& matrix_ptrs) { std::vector cuda_ptrs; const size_t matrix_size = rows * rows; for (size_t i = 0; i < num_batches; ++i) { cuda_ptrs.push_back(workspace_data); workspace_data += matrix_size; } - CUDA_RETURN_IF_ERROR(cudaMemcpy(matrix_ptrs.get(), cuda_ptrs.data(), sizeof(T*) * num_batches, - cudaMemcpyHostToDevice)); + + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(matrix_ptrs.get(), cuda_ptrs.data(), sizeof(T*) * num_batches, + cudaMemcpyHostToDevice, stream)); return Status::OK(); } -Status 
CheckForSingularity(const IAllocatorUniquePtr& info, const std::unique_ptr& info_cpu, size_t num_batches) { +Status CheckForSingularity(cudaStream_t stream, const IAllocatorUniquePtr& info, const std::unique_ptr& info_cpu, size_t num_batches) { // Let's check if any of the info values is non-zero - CUDA_RETURN_IF_ERROR(cudaMemcpy(info_cpu.get(), info.get(), sizeof(int) * num_batches, - cudaMemcpyDeviceToHost)); + // cudaMemcpyAsync from device memory to pageable host memory will return only once the copy has completed. + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(info_cpu.get(), info.get(), sizeof(int) * num_batches, + cudaMemcpyDeviceToHost, stream)); for (size_t i = 0; i < num_batches; ++i) { if (info_cpu[i] != 0) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Matrix is singular at batch:", i); @@ -63,7 +65,7 @@ Status CheckForSingularity(const IAllocatorUniquePtr& info, const std::uniq template struct Inverse::ComputeImpl { - Status operator()(Inverse::CublasHandle cublas_h, const Inverse* inst, const Tensor& input, Tensor& output, + Status operator()(cudaStream_t stream, Inverse::CublasHandle cublas_h, const Inverse* inst, const Tensor& input, Tensor& output, const IAllocatorUniquePtr& info, const IAllocatorUniquePtr& pivots, size_t num_batches, size_t rows) const { using namespace onnxruntime::cuda; @@ -79,52 +81,52 @@ struct Inverse::ComputeImpl { IAllocatorUniquePtr input_workspace = inst->GetScratchBuffer(input_count); if (std::is_same::value) { // Convert from MLFloat16(half) to float - Impl_Cast(reinterpret_cast(input.Data()), input_workspace.get(), input_count); + Impl_Cast(stream, reinterpret_cast(input.Data()), input_workspace.get(), input_count); } else { - CUDA_RETURN_IF_ERROR(cudaMemcpy(input_workspace.get(), input.Data(), sizeof(float) * input_count, - cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(input_workspace.get(), input.Data(), sizeof(float) * input_count, + cudaMemcpyDeviceToDevice, stream)); } IAllocatorUniquePtr matrix_ptrs = inst->GetScratchBuffer(n_batches); - ORT_RETURN_IF_ERROR(ComputeMatrixOffsets(input_workspace.get(), num_batches, rows, matrix_ptrs)); + ORT_RETURN_IF_ERROR(ComputeMatrixOffsets(stream, input_workspace.get(), num_batches, rows, matrix_ptrs)); // Do LU factorization CUBLAS_RETURN_IF_ERROR(cublasSgetrfBatched(cublas_h, dim, matrix_ptrs.get(), dim, pivots.get(), info.get(), n_batches)); - ORT_RETURN_IF_ERROR(CheckForSingularity(info, info_cpu, num_batches)); + ORT_RETURN_IF_ERROR(CheckForSingularity(stream, info, info_cpu, num_batches)); // Need to compute ptrs for output buffers // Output for MLFloat IAllocatorUniquePtr output_ptrs = inst->GetScratchBuffer(n_batches); if (std::is_same::value) { IAllocatorUniquePtr ml_float_output = inst->GetScratchBuffer(input_count); - ORT_RETURN_IF_ERROR(ComputeMatrixOffsets(ml_float_output.get(), num_batches, rows, output_ptrs)); + ORT_RETURN_IF_ERROR(ComputeMatrixOffsets(stream, ml_float_output.get(), num_batches, rows, output_ptrs)); // Do the inverse CUBLAS_RETURN_IF_ERROR(cublasSgetriBatched(cublas_h, dim, matrix_ptrs.get(), dim, pivots.get(), output_ptrs.get(), dim, info.get(), n_batches)); - ORT_RETURN_IF_ERROR(CheckForSingularity(info, info_cpu, num_batches)); + ORT_RETURN_IF_ERROR(CheckForSingularity(stream, info, info_cpu, num_batches)); // Copy the result to output with casting - Impl_Cast(ml_float_output.get(), reinterpret_cast(output.MutableData()), input_count); + Impl_Cast(stream, ml_float_output.get(), reinterpret_cast(output.MutableData()), input_count); // We are done here } 
else { - ORT_RETURN_IF_ERROR(ComputeMatrixOffsets(output.MutableData(), num_batches, rows, output_ptrs)); + ORT_RETURN_IF_ERROR(ComputeMatrixOffsets(stream, output.MutableData(), num_batches, rows, output_ptrs)); // Do the inverse CUBLAS_RETURN_IF_ERROR(cublasSgetriBatched(cublas_h, dim, matrix_ptrs.get(), dim, pivots.get(), output_ptrs.get(), dim, info.get(), n_batches)); - ORT_RETURN_IF_ERROR(CheckForSingularity(info, info_cpu, num_batches)); + ORT_RETURN_IF_ERROR(CheckForSingularity(stream, info, info_cpu, num_batches)); // We are done here } } else if (std::is_same::value) { IAllocatorUniquePtr input_workspace = inst->GetScratchBuffer(static_cast(input_count)); - CUDA_RETURN_IF_ERROR(cudaMemcpy(input_workspace.get(), input.Data(), sizeof(double) * input_count, - cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(input_workspace.get(), input.Data(), sizeof(double) * input_count, + cudaMemcpyDeviceToDevice, stream)); IAllocatorUniquePtr matrix_ptrs = inst->GetScratchBuffer(n_batches); - ORT_RETURN_IF_ERROR(ComputeMatrixOffsets(input_workspace.get(), num_batches, rows, matrix_ptrs)); + ORT_RETURN_IF_ERROR(ComputeMatrixOffsets(stream, input_workspace.get(), num_batches, rows, matrix_ptrs)); // Do LU factorization CUBLAS_RETURN_IF_ERROR(cublasDgetrfBatched(cublas_h, dim, matrix_ptrs.get(), dim, pivots.get(), info.get(), n_batches)); - ORT_RETURN_IF_ERROR(CheckForSingularity(info, info_cpu, num_batches)); + ORT_RETURN_IF_ERROR(CheckForSingularity(stream, info, info_cpu, num_batches)); // Need to compute ptrs for output buffers IAllocatorUniquePtr output_ptrs = inst->GetScratchBuffer(n_batches); - ORT_RETURN_IF_ERROR(ComputeMatrixOffsets(output.MutableData(), num_batches, rows, output_ptrs)); + ORT_RETURN_IF_ERROR(ComputeMatrixOffsets(stream, output.MutableData(), num_batches, rows, output_ptrs)); CUBLAS_RETURN_IF_ERROR(cublasDgetriBatched(cublas_h, dim, matrix_ptrs.get(), dim, pivots.get(), output_ptrs.get(), dim, info.get(), n_batches)); - ORT_RETURN_IF_ERROR(CheckForSingularity(info, info_cpu, num_batches)); + ORT_RETURN_IF_ERROR(CheckForSingularity(stream, info, info_cpu, num_batches)); // We are done here } else { ORT_THROW("Type is not supported"); @@ -148,11 +150,11 @@ Status Inverse::ComputeInternal(OpKernelContext* ctx) const { } IAllocatorUniquePtr info = GetScratchBuffer(num_batches); - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(info.get(), 0, num_batches)); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(info.get(), 0, num_batches, Stream())); IAllocatorUniquePtr pivots = GetScratchBuffer(rows * num_batches); utils::MLTypeCallDispatcherRet t_disp(input->GetElementType()); - return t_disp.Invoke(Base::CublasHandle(), this, *input, *output, info, pivots, num_batches, rows); + return t_disp.Invoke(Stream(), Base::CublasHandle(), this, *input, *output, info, pivots, num_batches, rows); } } // namespace cuda diff --git a/onnxruntime/contrib_ops/cuda/layer_norm.cc b/onnxruntime/contrib_ops/cuda/layer_norm.cc index 3a864bc7b7..12f37f36a0 100644 --- a/onnxruntime/contrib_ops/cuda/layer_norm.cc +++ b/onnxruntime/contrib_ops/cuda/layer_norm.cc @@ -98,7 +98,7 @@ Status LayerNorm::ComputeInternal(OpKernelContext* ctx) const inv_var_data = reinterpret_cast(var->template MutableData()); } - HostApplyLayerNorm(GetDeviceProp(), Y_data, mean_data, inv_var_data, X_data, n1, n2, epsilon_, scale_data, bias_data); + HostApplyLayerNorm(GetDeviceProp(), Stream(), Y_data, mean_data, inv_var_data, X_data, n1, n2, epsilon_, scale_data, bias_data); return Status::OK(); } diff --git 
a/onnxruntime/contrib_ops/cuda/layer_norm_impl.cu b/onnxruntime/contrib_ops/cuda/layer_norm_impl.cu index 0d2d6fd2e2..46e8fa2900 100644 --- a/onnxruntime/contrib_ops/cuda/layer_norm_impl.cu +++ b/onnxruntime/contrib_ops/cuda/layer_norm_impl.cu @@ -350,6 +350,7 @@ __global__ void cuApplyLayerNorm( template void HostApplyLayerNorm( const cudaDeviceProp& prop, + cudaStream_t stream, T* output, U* mean, U* invvar, @@ -367,7 +368,7 @@ void HostApplyLayerNorm( const dim3 blocks(1, std::min(n1, maxGridY), 1); int nshared = threads.y > 1 ? threads.y * sizeof(U) + (threads.y / 2) * sizeof(U) : 0; - cuApplyLayerNorm<<>>( + cuApplyLayerNorm<<>>( output, mean, invvar, @@ -378,7 +379,7 @@ void HostApplyLayerNorm( } #define LAYERNORM_LINEAR_IMPL(T, U, simplified) \ - template void HostApplyLayerNorm(const cudaDeviceProp& prop, T* output, U* mean, U* invvar, const T* input, int n1, int n2, \ + template void HostApplyLayerNorm(const cudaDeviceProp& prop, cudaStream_t stream, T* output, U* mean, U* invvar, const T* input, int n1, int n2, \ double epsilon, const T* gamma, const T* beta); LAYERNORM_LINEAR_IMPL(float, float, true) diff --git a/onnxruntime/contrib_ops/cuda/layer_norm_impl.h b/onnxruntime/contrib_ops/cuda/layer_norm_impl.h index 039b7700a6..1705d99915 100644 --- a/onnxruntime/contrib_ops/cuda/layer_norm_impl.h +++ b/onnxruntime/contrib_ops/cuda/layer_norm_impl.h @@ -32,6 +32,7 @@ namespace cuda { template void HostApplyLayerNorm( const cudaDeviceProp& prop, + cudaStream_t stream, T* output, U* mean, U* invvar, diff --git a/onnxruntime/contrib_ops/cuda/math/bias_softmax.cc b/onnxruntime/contrib_ops/cuda/math/bias_softmax.cc index 71d8679319..d9d30055dd 100644 --- a/onnxruntime/contrib_ops/cuda/math/bias_softmax.cc +++ b/onnxruntime/contrib_ops/cuda/math/bias_softmax.cc @@ -15,6 +15,7 @@ namespace cuda { template void DispatchBiasSoftmaxForwardImpl( + cudaStream_t stream, Tensor* output_tensor, const Tensor* input_tensor, const Tensor* input_bias_tensor, @@ -25,6 +26,7 @@ void DispatchBiasSoftmaxForwardImpl( template void DispatchBiasSoftMaxForwardViaDnnLibraryImpl( + cudaStream_t stream, cudnnHandle_t cudaDnnHandle, int element_count, int batch_count, @@ -64,12 +66,12 @@ Status BiasSoftmax::ComputeInternal(OpKernelContext* ctx) const { // expect thread blocks can fill SM at high occupancy without overflowing registers utils::MLTypeCallDispatcher t_disp(X->GetElementType()); - t_disp.Invoke(Y, X, B, D, N, D, broadcast_size); + t_disp.Invoke(Stream(), Y, X, B, D, N, D, broadcast_size); } else { // need to fallback to add kernel + CUDA DNN library softmax call :/ utils::MLTypeCallDispatcher t_disp(X->GetElementType()); - t_disp.Invoke(CudnnHandle(), D, N, broadcast_axis, softmax_axis, X_shape, X, B_shape, B, Y); + t_disp.Invoke(Stream(), CudnnHandle(), D, N, broadcast_axis, softmax_axis, X_shape, X, B_shape, B, Y); } return Status::OK(); @@ -77,6 +79,7 @@ Status BiasSoftmax::ComputeInternal(OpKernelContext* ctx) const { template void DispatchBiasSoftmaxForward::operator()( + cudaStream_t stream, Tensor* output, const Tensor* input, const Tensor* input_bias, @@ -85,6 +88,7 @@ void DispatchBiasSoftmaxForward::operator()( int batch_stride, int bias_broadcast_size_per_batch) { DispatchBiasSoftmaxForwardImpl( + stream, output, input, input_bias, @@ -96,6 +100,7 @@ void DispatchBiasSoftmaxForward::operator()( template void DispatchBiasSoftMaxForwardViaDnnLibrary::operator()( + cudaStream_t stream, cudnnHandle_t cudaDnnHandle, int element_count, int batch_count, @@ -107,6 +112,7 @@ void 
DispatchBiasSoftMaxForwardViaDnnLibrary::operator()( const onnxruntime::Tensor* B, onnxruntime::Tensor* Y) { DispatchBiasSoftMaxForwardViaDnnLibraryImpl( + stream, cudaDnnHandle, element_count, batch_count, diff --git a/onnxruntime/contrib_ops/cuda/math/bias_softmax.h b/onnxruntime/contrib_ops/cuda/math/bias_softmax.h index 5bbc7266a3..03baec8d35 100644 --- a/onnxruntime/contrib_ops/cuda/math/bias_softmax.h +++ b/onnxruntime/contrib_ops/cuda/math/bias_softmax.h @@ -13,6 +13,7 @@ namespace cuda { template struct DispatchBiasSoftmaxForward { void operator()( + cudaStream_t stream, Tensor* output, const Tensor* input, const Tensor* input_bias, @@ -25,6 +26,7 @@ struct DispatchBiasSoftmaxForward { template struct DispatchBiasSoftMaxForwardViaDnnLibrary { void operator()( + cudaStream_t stream, cudnnHandle_t cudaDnnHandle, int element_count, int batch_count, diff --git a/onnxruntime/contrib_ops/cuda/math/bias_softmax_impl.cu b/onnxruntime/contrib_ops/cuda/math/bias_softmax_impl.cu index 959a2d191c..27b2363219 100644 --- a/onnxruntime/contrib_ops/cuda/math/bias_softmax_impl.cu +++ b/onnxruntime/contrib_ops/cuda/math/bias_softmax_impl.cu @@ -127,6 +127,7 @@ __global__ void BiasSoftmaxWarpForward( template void DispatchBiasSoftmaxForwardImpl( + cudaStream_t stream, Tensor* output_tensor, const Tensor* input_tensor, const Tensor* input_bias_tensor, @@ -167,47 +168,47 @@ void DispatchBiasSoftmaxForwardImpl( switch (log2_elements) { case 0: // 1 BiasSoftmaxWarpForward - <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); + <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 1: // 2 BiasSoftmaxWarpForward - <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); + <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 2: // 4 BiasSoftmaxWarpForward - <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); + <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 3: // 8 BiasSoftmaxWarpForward - <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); + <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 4: // 16 BiasSoftmaxWarpForward - <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); + <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 5: // 32 BiasSoftmaxWarpForward - <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); + <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 6: // 64 BiasSoftmaxWarpForward - <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); + <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 7: // 128 BiasSoftmaxWarpForward - <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); + <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 8: // 256 
BiasSoftmaxWarpForward - <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); + <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 9: // 512 BiasSoftmaxWarpForward - <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); + <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 10: // 1024 BiasSoftmaxWarpForward - <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); + <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; default: break; @@ -216,6 +217,7 @@ void DispatchBiasSoftmaxForwardImpl( #define SPECIALIZED_BIAS_SOFTMAX_IMPL(T) \ template void DispatchBiasSoftmaxForwardImpl( \ + cudaStream_t stream, \ Tensor * output_tensor, \ const Tensor* input_tensor, \ const Tensor* input_bias_tensor, \ @@ -232,6 +234,7 @@ SPECIALIZED_BIAS_SOFTMAX_IMPL(MLFloat16) // note: This is an unhappy path! There is no performance benefit for the fusion. template void DispatchBiasSoftMaxForwardViaDnnLibraryImpl( + cudaStream_t stream, cudnnHandle_t cudaDnnHandle, int element_count, int batch_count, @@ -278,6 +281,7 @@ void DispatchBiasSoftMaxForwardViaDnnLibraryImpl( // invoke elementwise add with broadcast kernel ::onnxruntime::cuda::BinaryElementWiseImpl( + stream, (int32_t)X_shape.NumDimensions(), &lhs_padded_strides, X_data, @@ -311,6 +315,7 @@ void DispatchBiasSoftMaxForwardViaDnnLibraryImpl( #define SPECIALIZED_BIAS_SOFTMAX_IMPL_VIA_DNN(T) \ template void DispatchBiasSoftMaxForwardViaDnnLibraryImpl( \ + cudaStream_t stream, \ cudnnHandle_t cudaDnnHandle, \ int element_count, \ int batch_count, \ diff --git a/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops.cc b/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops.cc index a96e576b7d..5f85223a6b 100644 --- a/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops.cc +++ b/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops.cc @@ -25,6 +25,7 @@ namespace cuda { BinaryElementwisePreparation prepare; \ ORT_RETURN_IF_ERROR(Prepare(context, &prepare)); \ Impl_##x::MappedType>( \ + Stream(), \ prepare.output_rank_or_simple_broadcast, \ &prepare.lhs_padded_strides, \ reinterpret_cast::MappedType*>(prepare.lhs_tensor->template Data()), \ diff --git a/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops_impl.cu b/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops_impl.cu index c6b977ddbe..01791ed94c 100644 --- a/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops_impl.cu +++ b/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops_impl.cu @@ -20,7 +20,8 @@ namespace cuda { #define CONTRIB_BINARY_ELEMENTWISE_IMPL(name) \ CONTRIB_BINARY_ELEMENTWISE_IMPL_DECLARATION(name) { \ - BinaryElementWiseImpl(output_rank_or_simple_broadcast, \ + BinaryElementWiseImpl(stream, \ + output_rank_or_simple_broadcast, \ lhs_padded_strides, \ lhs_data, \ rhs_padded_strides, \ @@ -34,7 +35,8 @@ namespace cuda { } #define CONTRIB_SPECIALIZED_BINARY_ELEMENTWISE_IMPL(x, T) \ - template void Impl_##x(int32_t output_rank, \ + template void Impl_##x(cudaStream_t stream, \ + int32_t output_rank, \ const TArray* lhs_padded_strides, \ const T* lhs_data, \ const TArray* rhs_padded_strides, \ diff --git a/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops_impl.h 
b/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops_impl.h index bb2af2f55a..6ff4233278 100644 --- a/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops_impl.h +++ b/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops_impl.h @@ -20,6 +20,7 @@ namespace cuda { #define CONTRIB_BINARY_ELEMENTWISE_IMPL_DECLARATION(name) \ template \ void Impl_##name( \ + cudaStream_t stream, \ int32_t output_rank_or_simple_broadcast, \ const TArray* lhs_padded_strides, \ const T* lhs_data, \ diff --git a/onnxruntime/contrib_ops/cuda/math/complex_mul.cc b/onnxruntime/contrib_ops/cuda/math/complex_mul.cc index 70d286ae0d..9584e8de3c 100644 --- a/onnxruntime/contrib_ops/cuda/math/complex_mul.cc +++ b/onnxruntime/contrib_ops/cuda/math/complex_mul.cc @@ -42,6 +42,7 @@ Status ComplexMul::ComputeInternal(OpKernelContext* context) const { BinaryElementwisePreparation prepare; ORT_RETURN_IF_ERROR(Prepare(context, &prepare)); ComplexMul_Impl::MappedType>( + Stream(), prepare.output_rank_or_simple_broadcast, &prepare.lhs_padded_strides, reinterpret_cast::MappedType*>(prepare.lhs_tensor->template Data()), diff --git a/onnxruntime/contrib_ops/cuda/math/complex_mul_impl.cu b/onnxruntime/contrib_ops/cuda/math/complex_mul_impl.cu index 0004cf9433..fdbc986b89 100644 --- a/onnxruntime/contrib_ops/cuda/math/complex_mul_impl.cu +++ b/onnxruntime/contrib_ops/cuda/math/complex_mul_impl.cu @@ -90,6 +90,7 @@ __global__ void _ElementWiseWithStrideTwo( template void ComplexMul_Impl( + cudaStream_t stream, int32_t output_rank_or_simple_broadcast, const TArray* lhs_padded_strides, const T* lhs_data, @@ -110,7 +111,7 @@ void ComplexMul_Impl( CUDA_LONG N = static_cast(count); if (lhs_padded_strides && rhs_padded_strides && lhs_padded_strides->Size() && rhs_padded_strides->Size()) - _ElementWiseWithStrideTwo<<>>( + _ElementWiseWithStrideTwo<<>>( output_rank_or_simple_broadcast, *lhs_padded_strides, lhs_data, @@ -123,7 +124,7 @@ void ComplexMul_Impl( rhs_size, is_conj); else if (lhs_padded_strides && lhs_padded_strides->Size()) - _ElementWiseWithStrideTwo<<>>( + _ElementWiseWithStrideTwo<<>>( output_rank_or_simple_broadcast, *lhs_padded_strides, lhs_data, @@ -136,7 +137,7 @@ void ComplexMul_Impl( rhs_size, is_conj); else - _ElementWiseWithStrideTwo<<>>( + _ElementWiseWithStrideTwo<<>>( output_rank_or_simple_broadcast, *lhs_padded_strides, lhs_data, @@ -152,6 +153,7 @@ void ComplexMul_Impl( #define SPECIALIZE_STACKEDCOMPLEXMUL_IMPL(T) \ template void ComplexMul_Impl( \ + cudaStream_t stream, \ int32_t output_rank_or_simple_broadcast, \ const TArray* lhs_padded_strides, \ const T* lhs_data, \ diff --git a/onnxruntime/contrib_ops/cuda/math/complex_mul_impl.h b/onnxruntime/contrib_ops/cuda/math/complex_mul_impl.h index d48eea9a9f..dae66d8325 100644 --- a/onnxruntime/contrib_ops/cuda/math/complex_mul_impl.h +++ b/onnxruntime/contrib_ops/cuda/math/complex_mul_impl.h @@ -13,6 +13,7 @@ using namespace ::onnxruntime::cuda; template void ComplexMul_Impl( + cudaStream_t stream, int32_t output_rank_or_simple_broadcast, const TArray* lhs_padded_strides, const T* lhs_data, diff --git a/onnxruntime/contrib_ops/cuda/math/fft_ops.cc b/onnxruntime/contrib_ops/cuda/math/fft_ops.cc index 3c60644d70..c685882e92 100644 --- a/onnxruntime/contrib_ops/cuda/math/fft_ops.cc +++ b/onnxruntime/contrib_ops/cuda/math/fft_ops.cc @@ -127,11 +127,11 @@ Status FFTBase::DoFFT(OpKernelContext* context, const Tensor* X, bool complex Tensor* Y = const_cast(context)->Output(0, TensorShape(output_dims)); auto* x_data = reinterpret_cast(X->template Data()); auto* 
y_data = reinterpret_cast(Y->template MutableData()); - + CUFFT_RETURN_IF_ERROR(cufftSetStream(plan_info.plan, Stream())); CUFFT_RETURN_IF_ERROR(cufftXtExec(plan_info.plan, const_cast(x_data), y_data, inverse ? CUFFT_INVERSE : CUFFT_FORWARD)); if (inverse) { - PostProcess(signal_dims, output_size, y_data); + PostProcess(Stream(), signal_dims, output_size, y_data); } return Status::OK(); diff --git a/onnxruntime/contrib_ops/cuda/math/fft_ops_impl.cu b/onnxruntime/contrib_ops/cuda/math/fft_ops_impl.cu index 20d6272628..c1f4a088e0 100644 --- a/onnxruntime/contrib_ops/cuda/math/fft_ops_impl.cu +++ b/onnxruntime/contrib_ops/cuda/math/fft_ops_impl.cu @@ -27,14 +27,14 @@ __global__ void _Normalize( } template -void PostProcess(const std::vector& signal_dims, int64_t N, T* output_data) { +void PostProcess(cudaStream_t stream, const std::vector& signal_dims, int64_t N, T* output_data) { int64_t scale = std::accumulate(signal_dims.begin(), signal_dims.end(), 1ll, std::multiplies()); int blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); - _Normalize<<>>(output_data, N, static_cast(scale)); + _Normalize<<>>(output_data, N, static_cast(scale)); } #define SPECIALIZED_IMPL(T) \ - template void PostProcess(const std::vector& signal_dims, int64_t N, T* output_data); + template void PostProcess(cudaStream_t stream, const std::vector& signal_dims, int64_t N, T* output_data); SPECIALIZED_IMPL(float) SPECIALIZED_IMPL(double) diff --git a/onnxruntime/contrib_ops/cuda/math/fft_ops_impl.h b/onnxruntime/contrib_ops/cuda/math/fft_ops_impl.h index 8a7f7789c0..2312acd5d3 100644 --- a/onnxruntime/contrib_ops/cuda/math/fft_ops_impl.h +++ b/onnxruntime/contrib_ops/cuda/math/fft_ops_impl.h @@ -12,7 +12,7 @@ namespace contrib { namespace cuda { template -void PostProcess(const std::vector& signal_dims, int64_t N, T* output_data); +void PostProcess(cudaStream_t stream, const std::vector& signal_dims, int64_t N, T* output_data); } // namespace cuda } // namespace contrib diff --git a/onnxruntime/contrib_ops/cuda/quantization/attention_quantization.cc b/onnxruntime/contrib_ops/cuda/quantization/attention_quantization.cc index 67d51b53d5..5833e2fcee 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/attention_quantization.cc +++ b/onnxruntime/contrib_ops/cuda/quantization/attention_quantization.cc @@ -158,6 +158,7 @@ Status QAttention::ComputeInternal(OpKernelContext* context) const { } // scale back and bias CudaDequantizeWithBias( + Stream(), gemm_buffer_quantized.get(), reinterpret_cast(bias->template Data()), reinterpret_cast(gemm_buffer.get()), @@ -172,6 +173,7 @@ Status QAttention::ComputeInternal(OpKernelContext* context) const { auto temp_buffer = GetScratchBuffer(workSpaceSize); if (!LaunchAttentionKernel( GetDeviceProp(), + Stream(), reinterpret_cast(gemm_buffer.get()), nullptr == mask_index ? nullptr : mask_index->template Data(), nullptr == mask_index ? 
nullptr : &(mask_index->Shape().GetDims()), diff --git a/onnxruntime/contrib_ops/cuda/quantization/attention_quantization_impl.cu b/onnxruntime/contrib_ops/cuda/quantization/attention_quantization_impl.cu index 42791ae795..168c8a6f42 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/attention_quantization_impl.cu +++ b/onnxruntime/contrib_ops/cuda/quantization/attention_quantization_impl.cu @@ -31,10 +31,10 @@ __global__ void DequantizeLinearKernel(const int32_t* quantize, const T* bias, T } template -Status CudaDequantizeWithBias(const int32_t* quantize, const T* bias, T* output, T scale, int m, int n) { +Status CudaDequantizeWithBias(cudaStream_t stream, const int32_t* quantize, const T* bias, T* output, T scale, int m, int n) { int blocksPerGrid = static_cast(CeilDiv(m * n, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); CUDA_LONG N = static_cast(m * n); - DequantizeLinearKernel<<>>( + DequantizeLinearKernel<<>>( quantize, bias, output, @@ -44,8 +44,8 @@ Status CudaDequantizeWithBias(const int32_t* quantize, const T* bias, T* output, return Status::OK(); } -template Status CudaDequantizeWithBias(const int32_t* quantize, const float* bias, float* output, float scale, int m, int n); -template Status CudaDequantizeWithBias(const int32_t* quantize, const half* bias, half* output, half scale, int m, int n); +template Status CudaDequantizeWithBias(cudaStream_t stream, const int32_t* quantize, const float* bias, float* output, float scale, int m, int n); +template Status CudaDequantizeWithBias(cudaStream_t stream, const int32_t* quantize, const half* bias, half* output, half scale, int m, int n); } // namespace cuda } // namespace contrib diff --git a/onnxruntime/contrib_ops/cuda/quantization/attention_quantization_impl.cuh b/onnxruntime/contrib_ops/cuda/quantization/attention_quantization_impl.cuh index dc0ba262fa..b1aa2b9226 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/attention_quantization_impl.cuh +++ b/onnxruntime/contrib_ops/cuda/quantization/attention_quantization_impl.cuh @@ -8,7 +8,7 @@ namespace onnxruntime { namespace contrib { namespace cuda { template -Status CudaDequantizeWithBias(const int32_t* quantize, const Tin* bias, Tin* output, Tin scale, int m, int n); +Status CudaDequantizeWithBias(cudaStream_t stream, const int32_t* quantize, const Tin* bias, Tin* output, Tin scale, int m, int n); } // namespace cuda } // namespace contrib diff --git a/onnxruntime/contrib_ops/cuda/tensor/crop.cc b/onnxruntime/contrib_ops/cuda/tensor/crop.cc index 66e022e3c4..76495c8b23 100644 --- a/onnxruntime/contrib_ops/cuda/tensor/crop.cc +++ b/onnxruntime/contrib_ops/cuda/tensor/crop.cc @@ -56,6 +56,7 @@ Status Crop::ComputeInternal(OpKernelContext* context) const { fast_divmod fdm_YHW(gsl::narrow_cast((bottomLimit - topBorder) * (rightLimit - leftBorder))); CropImpl( + Stream(), reinterpret_cast(X->template Data()), gsl::narrow_cast(leftBorder), gsl::narrow_cast(topBorder), diff --git a/onnxruntime/contrib_ops/cuda/tensor/crop_impl.cu b/onnxruntime/contrib_ops/cuda/tensor/crop_impl.cu index c69c274cce..e407164e37 100644 --- a/onnxruntime/contrib_ops/cuda/tensor/crop_impl.cu +++ b/onnxruntime/contrib_ops/cuda/tensor/crop_impl.cu @@ -31,6 +31,7 @@ __global__ void _CropKernel( template void CropImpl( + cudaStream_t stream, const T* input_data, const int src_start_x, const int src_start_y, @@ -41,12 +42,12 @@ void CropImpl( T* output_data, const size_t N) { int blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); - _CropKernel<<>>( + _CropKernel<<>>( 
input_data, src_start_x, src_start_y, src_w, src_hw, fdm_dst_w, fdm_dst_hw, output_data, (CUDA_LONG)N); } #define SPECIALIZED_IMPL(T) \ - template void CropImpl(const T* input_data, const int src_start_x, const int src_start_y, const int src_w, const int src_hw, const fast_divmod& fdm_dst_w, const fast_divmod& fdm_dst_hw, T* output_data, const size_t N); + template void CropImpl(cudaStream_t stream, const T* input_data, const int src_start_x, const int src_start_y, const int src_w, const int src_hw, const fast_divmod& fdm_dst_w, const fast_divmod& fdm_dst_hw, T* output_data, const size_t N); SPECIALIZED_IMPL(float) SPECIALIZED_IMPL(double) diff --git a/onnxruntime/contrib_ops/cuda/tensor/crop_impl.h b/onnxruntime/contrib_ops/cuda/tensor/crop_impl.h index 07ffb64d2d..8eb649a48c 100644 --- a/onnxruntime/contrib_ops/cuda/tensor/crop_impl.h +++ b/onnxruntime/contrib_ops/cuda/tensor/crop_impl.h @@ -12,6 +12,7 @@ using namespace onnxruntime::cuda; template void CropImpl( + cudaStream_t stream, const T* input_data, const int src_start_x, const int src_start_y, diff --git a/onnxruntime/contrib_ops/cuda/tensor/image_scaler.cc b/onnxruntime/contrib_ops/cuda/tensor/image_scaler.cc index bf1f33e84a..fb592a5cf3 100644 --- a/onnxruntime/contrib_ops/cuda/tensor/image_scaler.cc +++ b/onnxruntime/contrib_ops/cuda/tensor/image_scaler.cc @@ -30,7 +30,7 @@ ImageScaler::ImageScaler(const OpKernelInfo& info) : CudaKernel(info) { ORT_ENFORCE(info.GetAttrs("bias", bias_).IsOK()); b_data_ = GetScratchBuffer(bias_.size()); - CUDA_CALL_THROW(cudaMemcpy(b_data_.get(), bias_.data(), sizeof(float) * bias_.size(), cudaMemcpyHostToDevice)); + CUDA_CALL_THROW(cudaMemcpyAsync(b_data_.get(), bias_.data(), sizeof(float) * bias_.size(), cudaMemcpyHostToDevice, Stream())); } template @@ -53,6 +53,7 @@ Status ImageScaler::ComputeInternal(OpKernelContext* context) const { typedef typename ToCudaType::MappedType CudaT; ImageScalerImpl( + Stream(), reinterpret_cast(X->template Data()), scale_, b_data_.get(), diff --git a/onnxruntime/contrib_ops/cuda/tensor/image_scaler_impl.cu b/onnxruntime/contrib_ops/cuda/tensor/image_scaler_impl.cu index d0eb35d267..a63cd4755c 100644 --- a/onnxruntime/contrib_ops/cuda/tensor/image_scaler_impl.cu +++ b/onnxruntime/contrib_ops/cuda/tensor/image_scaler_impl.cu @@ -30,6 +30,7 @@ __global__ void _ImageScalerKernel( template void ImageScalerImpl( + cudaStream_t stream, const T* input_data, const float scale, const float* bias_data, @@ -40,17 +41,17 @@ void ImageScalerImpl( fast_divmod fdm_HW((int)(dims[2] * dims[3])); fast_divmod fdm_C; if (dims[0] == 1) { - _ImageScalerKernel<<>>( + _ImageScalerKernel<<>>( input_data, scale, bias_data, fdm_C, fdm_HW, output_data, N); } else { fdm_C = fast_divmod((int)dims[1]); - _ImageScalerKernel<<>>( + _ImageScalerKernel<<>>( input_data, scale, bias_data, fdm_C, fdm_HW, output_data, N); } } #define SPECIALIZED_IMPL(T) \ - template void ImageScalerImpl(const T* input_data, const float scale, const float* bias_data, const int64_t dims[4], T* output_data, const size_t N); + template void ImageScalerImpl(cudaStream_t stream, const T* input_data, const float scale, const float* bias_data, const int64_t dims[4], T* output_data, const size_t N); SPECIALIZED_IMPL(float) SPECIALIZED_IMPL(double) diff --git a/onnxruntime/contrib_ops/cuda/tensor/image_scaler_impl.h b/onnxruntime/contrib_ops/cuda/tensor/image_scaler_impl.h index c014870894..7194041a71 100644 --- a/onnxruntime/contrib_ops/cuda/tensor/image_scaler_impl.h +++ 
b/onnxruntime/contrib_ops/cuda/tensor/image_scaler_impl.h @@ -10,6 +10,7 @@ namespace cuda { template void ImageScalerImpl( + cudaStream_t stream, const T* input_data, const float scale, const float* bias_data, diff --git a/onnxruntime/contrib_ops/rocm/math/bias_softmax.cc b/onnxruntime/contrib_ops/rocm/math/bias_softmax.cc index 9d318a3edf..954474aa09 100644 --- a/onnxruntime/contrib_ops/rocm/math/bias_softmax.cc +++ b/onnxruntime/contrib_ops/rocm/math/bias_softmax.cc @@ -15,6 +15,7 @@ namespace rocm { template void DispatchBiasSoftmaxForwardImpl( + hipStream_t stream, Tensor* output_tensor, const Tensor* input_tensor, const Tensor* input_bias_tensor, @@ -25,6 +26,7 @@ void DispatchBiasSoftmaxForwardImpl( template void DispatchBiasSoftMaxForwardViaDnnLibraryImpl( + hipStream_t stream, miopenHandle_t miopenHandle, int element_count, int batch_count, @@ -67,12 +69,12 @@ Status BiasSoftmax::ComputeInternal(OpKernelContext* ctx) const { // expect thread blocks can fill SM at high occupancy without overflowing registers utils::MLTypeCallDispatcher t_disp(X->GetElementType()); - t_disp.Invoke(Y, X, B, D, N, D, broadcast_size); + t_disp.Invoke(Stream(), Y, X, B, D, N, D, broadcast_size); } else { // need to fallback to add kernel + CUDA DNN library softmax call :/ utils::MLTypeCallDispatcher t_disp(X->GetElementType()); - t_disp.Invoke(MiopenHandle(), D, N, broadcast_axis, softmax_axis, X_shape, X, B_shape, B, Y); + t_disp.Invoke(Stream(), MiopenHandle(), D, N, broadcast_axis, softmax_axis, X_shape, X, B_shape, B, Y); } return Status::OK(); @@ -80,6 +82,7 @@ Status BiasSoftmax::ComputeInternal(OpKernelContext* ctx) const { template void DispatchBiasSoftmaxForward::operator()( + hipStream_t stream, Tensor* output, const Tensor* input, const Tensor* input_bias, @@ -88,6 +91,7 @@ void DispatchBiasSoftmaxForward::operator()( int batch_stride, int bias_broadcast_size_per_batch) { DispatchBiasSoftmaxForwardImpl( + stream, output, input, input_bias, @@ -99,6 +103,7 @@ void DispatchBiasSoftmaxForward::operator()( template void DispatchBiasSoftMaxForwardViaDnnLibrary::operator()( + hipStream_t stream, miopenHandle_t miopenHandle, int element_count, int batch_count, @@ -110,6 +115,7 @@ void DispatchBiasSoftMaxForwardViaDnnLibrary::operator()( const onnxruntime::Tensor* B, onnxruntime::Tensor* Y) { DispatchBiasSoftMaxForwardViaDnnLibraryImpl( + stream, miopenHandle, element_count, batch_count, diff --git a/onnxruntime/contrib_ops/rocm/math/bias_softmax.h b/onnxruntime/contrib_ops/rocm/math/bias_softmax.h index 04bc4d93b0..602ac5fafb 100644 --- a/onnxruntime/contrib_ops/rocm/math/bias_softmax.h +++ b/onnxruntime/contrib_ops/rocm/math/bias_softmax.h @@ -13,6 +13,7 @@ namespace rocm { template struct DispatchBiasSoftmaxForward { void operator()( + hipStream_t stream, Tensor* output, const Tensor* input, const Tensor* input_bias, @@ -25,6 +26,7 @@ struct DispatchBiasSoftmaxForward { template struct DispatchBiasSoftMaxForwardViaDnnLibrary { void operator()( + hipStream_t stream, miopenHandle_t miopenHandle, int element_count, int batch_count, diff --git a/onnxruntime/contrib_ops/rocm/math/bias_softmax_impl.cu b/onnxruntime/contrib_ops/rocm/math/bias_softmax_impl.cu index e8aad12e68..6bbc2c98d9 100644 --- a/onnxruntime/contrib_ops/rocm/math/bias_softmax_impl.cu +++ b/onnxruntime/contrib_ops/rocm/math/bias_softmax_impl.cu @@ -128,6 +128,7 @@ __global__ void BiasSoftmaxWarpForward( template void DispatchBiasSoftmaxForwardImpl( + hipStream_t stream, Tensor* output_tensor, const Tensor* input_tensor, const Tensor* 
input_bias_tensor, @@ -168,47 +169,47 @@ void DispatchBiasSoftmaxForwardImpl( // Launch code would be more elegant if C++ supported FOR CONSTEXPR switch (log2_elements) { case 0: // 1 - hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, stream, output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 1: // 2 - hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, stream, output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 2: // 4 - hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, stream, output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 3: // 8 - hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, stream, output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 4: // 16 - hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, stream, output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 5: // 32 - hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, stream, output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 6: // 64 - hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, stream, output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 7: // 128 - hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, stream, output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 8: // 256 - hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, stream, output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 9: // 512 - hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, stream, output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 10: // 1024 - hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, 0, + 
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, stream, output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; default: @@ -218,6 +219,7 @@ void DispatchBiasSoftmaxForwardImpl( #define SPECIALIZED_BIAS_SOFTMAX_IMPL(T) \ template void DispatchBiasSoftmaxForwardImpl( \ + hipStream_t stream, \ Tensor * output_tensor, \ const Tensor* input_tensor, \ const Tensor* input_bias_tensor, \ @@ -234,6 +236,7 @@ SPECIALIZED_BIAS_SOFTMAX_IMPL(MLFloat16) // note: This is an unhappy path! There is no performance benefit for the fusion. template void DispatchBiasSoftMaxForwardViaDnnLibraryImpl( + hipStream_t stream, miopenHandle_t miopenHandle, int element_count, int batch_count, @@ -278,6 +281,7 @@ void DispatchBiasSoftMaxForwardViaDnnLibraryImpl( // invoke elementwise add with broadcast kernel ::onnxruntime::rocm::BinaryElementWiseImpl( + stream, (int32_t)X_shape.NumDimensions(), &lhs_padded_strides, X_data, @@ -311,6 +315,7 @@ void DispatchBiasSoftMaxForwardViaDnnLibraryImpl( #define SPECIALIZED_BIAS_SOFTMAX_IMPL_VIA_DNN(T) \ template void DispatchBiasSoftMaxForwardViaDnnLibraryImpl( \ + hipStream_t stream, \ miopenHandle_t miopenHandle, \ int element_count, \ int batch_count, \ diff --git a/onnxruntime/core/framework/provider_bridge_ort.cc b/onnxruntime/core/framework/provider_bridge_ort.cc index 5e6a1f85eb..a12b5989d7 100644 --- a/onnxruntime/core/framework/provider_bridge_ort.cc +++ b/onnxruntime/core/framework/provider_bridge_ort.cc @@ -160,14 +160,16 @@ struct ProviderHostImpl : ProviderHost { return onnxruntime::make_unique(device_id, name); } - std::unique_ptr CreateGPUDataTransfer() override { return onnxruntime::make_unique(); } - - void cuda__Impl_Cast(const int64_t* input_data, int32_t* output_data, size_t count) override { - return cuda::Impl_Cast(input_data, output_data, count); + std::unique_ptr CreateGPUDataTransfer(void* stream) override { + return onnxruntime::make_unique(static_cast(stream)); } - void cuda__Impl_Cast(const int32_t* input_data, int64_t* output_data, size_t count) override { - return cuda::Impl_Cast(input_data, output_data, count); + void cuda__Impl_Cast(void* stream, const int64_t* input_data, int32_t* output_data, size_t count) override { + return cuda::Impl_Cast(static_cast(stream), input_data, output_data, count); + } + + void cuda__Impl_Cast(void* stream, const int32_t* input_data, int64_t* output_data, size_t count) override { + return cuda::Impl_Cast(static_cast(stream), input_data, output_data, count); } bool CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) override { return CudaCall(cudaError(retCode), exprString, libName, cudaError(successCode), msg); } @@ -684,6 +686,13 @@ std::shared_ptr CreateExecutionProviderFactory_Tensor return nullptr; } +std::shared_ptr CreateExecutionProviderFactory_Tensorrt(const OrtTensorRTProviderOptions* provider_options) { + if (auto provider = s_library_tensorrt.Get()) + return provider->CreateExecutionProviderFactory(provider_options); + + return nullptr; +} + std::shared_ptr CreateExecutionProviderFactory_OpenVINO(const OrtOpenVINOProviderOptions* provider_options) { if (auto provider = s_library_openvino.Get()) return provider->CreateExecutionProviderFactory(provider_options); @@ -719,6 +728,16 @@ ORT_API_STATUS_IMPL(OrtSessionOptionsAppendExecutionProvider_Tensorrt, _In_ OrtS return nullptr; } +ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider_TensorRT, 
_In_ OrtSessionOptions* options, _In_ const OrtTensorRTProviderOptions* tensorrt_options) { + auto factory = onnxruntime::CreateExecutionProviderFactory_Tensorrt(tensorrt_options); + if (!factory) { + return OrtApis::CreateStatus(ORT_FAIL, "SessionOptionsAppendExecutionProvider_Tensorrt: Failed to load shared library"); + } + + options->provider_factories.push_back(factory); + return nullptr; +} + ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider_OpenVINO, _In_ OrtSessionOptions* options, _In_ const OrtOpenVINOProviderOptions* provider_options) { auto factory = onnxruntime::CreateExecutionProviderFactory_OpenVINO(provider_options); if (!factory) { diff --git a/onnxruntime/core/providers/cpu/controlflow/loop.cc b/onnxruntime/core/providers/cpu/controlflow/loop.cc index f352f28746..23d8ab6427 100644 --- a/onnxruntime/core/providers/cpu/controlflow/loop.cc +++ b/onnxruntime/core/providers/cpu/controlflow/loop.cc @@ -178,7 +178,8 @@ class LoopImpl { LoopImpl(OpKernelContextInternal& context, const SessionState& session_state, const Loop::Info& info, - const Loop::ConcatOutput& concat_output_func); + const Loop::ConcatOutput& concat_output_func, + void* stream); // Initialize by validating all the inputs, and allocating the output tensors Status Initialize(); @@ -211,9 +212,11 @@ class LoopImpl { std::vector> loop_output_tensors_; const Loop::ConcatOutput& concat_output_func_; + void* stream_; }; -static Status ConcatenateCpuOutput(std::vector& per_iteration_output, +static Status ConcatenateCpuOutput(void* /*stream*/, + std::vector& per_iteration_output, void* output, size_t output_size_in_bytes) { const auto& first_output = per_iteration_output.front().Get(); const auto& per_iteration_shape = first_output.Shape(); @@ -253,6 +256,7 @@ Loop::Loop(const OpKernelInfo& info) : IControlFlowKernel(info) { ORT_IGNORE_RETURN_VALUE(proto); concat_output_func_ = ConcatenateCpuOutput; + stream_ = nullptr; } // we need this to be in the .cc so 'unique_ptr info_' can be handled @@ -345,7 +349,7 @@ Status Loop::Compute(OpKernelContext* ctx) const { ORT_ENFORCE(session_state, "Subgraph SessionState was not found for 'body' attribute."); ORT_ENFORCE(feeds_fetches_manager_, "CreateFeedsFetchesManager must be called prior to execution of graph."); - LoopImpl loop_impl{*ctx_internal, *session_state, *info_, concat_output_func_}; + LoopImpl loop_impl{*ctx_internal, *session_state, *info_, concat_output_func_, stream_}; auto status = loop_impl.Initialize(); ORT_RETURN_IF_ERROR(status); @@ -358,12 +362,14 @@ Status Loop::Compute(OpKernelContext* ctx) const { LoopImpl::LoopImpl(OpKernelContextInternal& context, const SessionState& session_state, const Loop::Info& subgraph_info, - const Loop::ConcatOutput& concat_output_func) + const Loop::ConcatOutput& concat_output_func, + void* stream) : context_(context), session_state_(session_state), info_(subgraph_info), implicit_inputs_(context_.GetImplicitInputs()), - concat_output_func_(concat_output_func) { + concat_output_func_(concat_output_func), + stream_(stream) { auto* max_trip_count_tensor = context.Input(0); max_trip_count_ = max_trip_count_tensor ? 
*max_trip_count_tensor->Data() : INT64_MAX; @@ -457,7 +463,7 @@ Status LoopImpl::ConcatenateLoopOutput(std::vector& per_iteration_outp TensorShape output_shape{dims}; Tensor* output = context_.Output(output_index, output_shape); - ORT_RETURN_IF_ERROR(concat_output_func_(per_iteration_output, output->MutableDataRaw(), output->SizeInBytes())); + ORT_RETURN_IF_ERROR(concat_output_func_(stream_, per_iteration_output, output->MutableDataRaw(), output->SizeInBytes())); return Status::OK(); } diff --git a/onnxruntime/core/providers/cpu/controlflow/loop.h b/onnxruntime/core/providers/cpu/controlflow/loop.h index 958dd162aa..73f863b22d 100644 --- a/onnxruntime/core/providers/cpu/controlflow/loop.h +++ b/onnxruntime/core/providers/cpu/controlflow/loop.h @@ -29,17 +29,19 @@ class Loop : public controlflow::IControlFlowKernel { // function to concatenate the OrtValue instances from each Loop iteration into a single output buffer. // @param per_iteration_output OrtValue instances from each iteration. Never empty. All should have the same shape. // @param output Pre-allocated output buffer. On device specific to the ExecutionProvider running the Loop node. - using ConcatOutput = std::function& per_iteration_output, + using ConcatOutput = std::function& per_iteration_output, void* output, size_t output_size_in_bytes)>; protected: // derived class can provide implementation for handling concatenation of Loop output on a different device void SetConcatOutputFunc(const ConcatOutput& concat_output_func) { concat_output_func_ = concat_output_func; } + void SetComputeStream(void* stream) { stream_ = stream; } private: // Info and FeedsFetchesManager re-used for each subgraph execution. std::unique_ptr info_; std::unique_ptr feeds_fetches_manager_; ConcatOutput concat_output_func_; + void* stream_; }; } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.cc b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.cc index c70b32fc95..af83133cee 100644 --- a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.cc +++ b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.cc @@ -14,7 +14,7 @@ namespace DeviceHelpers { namespace CpuDeviceHelpers { // CPU specific Data copy helper -Status DataCopy(const Tensor& input, Tensor& output) { +Status DataCopy(const Tensor& input, Tensor& output, void* /*einsum_cuda_assets*/) { ORT_ENFORCE(output.SizeInBytes() == input.SizeInBytes(), "Einsum op: The candidate output does not match the actual output's shape"); // There are no string tensors in Einsum's case - so safely use memcpy @@ -156,7 +156,7 @@ static std::unique_ptr DiagonalInnermostDims(const Tensor& input, return output; } -std::unique_ptr Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator) { +std::unique_ptr Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator, void* /*einsum_cuda_assets*/) { const auto& input_shape = input.Shape(); const auto& input_dims = input_shape.GetDims(); auto rank = static_cast(input_dims.size()); diff --git a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.h b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.h index baaf6821d7..d5f96a79b9 100644 --- a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.h +++ b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.h @@ -23,7 +23,7 @@ namespace EinsumOp { namespace DeviceHelpers { // Data copy op - 
Copies raw data from the source tensor's buffer to the destination tensor's buffer -using DataCopy = std::function; +using DataCopy = std::function; // Transpose op - Transposes given input based on data in `permutation` using Transpose = std::function& permutation, const Tensor& input, @@ -54,12 +54,12 @@ using ReduceSum = std::function(const Tensor& input, int64_t dim_1, int64_t dim_2, - AllocatorPtr allocator)>; + AllocatorPtr allocator, void* einsum_cuda_assets)>; // These are CPU specific device helper implementations namespace CpuDeviceHelpers { -Status DataCopy(const Tensor& input, Tensor& output); +Status DataCopy(const Tensor& input, Tensor& output, void* einsum_cuda_assets); Status Transpose(const std::vector& permutation, const Tensor& input, Tensor& output, const TensorShape* input_shape_override, void* einsum_cuda_assets); @@ -76,7 +76,7 @@ Tensor ReduceSum(const Tensor& input, const std::vector& reduce_axes, const TensorShape* input_shape_override, concurrency::ThreadPool* tp, void* einsum_cuda_assets); -std::unique_ptr Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator); +std::unique_ptr Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator, void* einsum_cuda_assets); } // namespace CpuDeviceHelpers diff --git a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_compute_preprocessor.cc b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_compute_preprocessor.cc index 91c700568a..88ef7738b2 100644 --- a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_compute_preprocessor.cc +++ b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_compute_preprocessor.cc @@ -440,7 +440,7 @@ Status EinsumComputePreprocessor::PreprocessInputs() { preprocessed = device_diagonal_func_(preprocessed ? *preprocessed : *inputs_[input_iter], subscript_indices_to_input_index[subscript_index], dim_index_in_preprocessed_input, - allocator_); + allocator_, einsum_ep_assets_); } ++dim_index_in_original_input; } diff --git a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_typed_compute_processor.cc b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_typed_compute_processor.cc index 1c55b9d7c8..0d6bb37ba3 100644 --- a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_typed_compute_processor.cc +++ b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_typed_compute_processor.cc @@ -58,13 +58,13 @@ void EinsumTypedComputeProcessor::FinalizeOutput(const Tensor& candidate_outp // into the buffer of the actual output given to us by the execution frame // We need to do this because the buffer owned by the output tensor of the op could be user provided buffer - auto status = device_data_copy_func_(*candidate_output_transposed, output); + auto status = device_data_copy_func_(*candidate_output_transposed, output, einsum_ep_assets_); ORT_ENFORCE(status.IsOK(), "Einsum op: Could not copy the intermediate output's buffer into the op's output buffer. Error: ", status.ErrorMessage()); } else { // Copy the output candidate into the op's output - auto status = device_data_copy_func_(candidate_output, output); + auto status = device_data_copy_func_(candidate_output, output, einsum_ep_assets_); ORT_ENFORCE(status.IsOK(), "Einsum op: Could not copy the intermediate output's buffer into the op's output buffer. 
Error: ", status.ErrorMessage()); } diff --git a/onnxruntime/core/providers/cuda/activation/activations.cc b/onnxruntime/core/providers/cuda/activation/activations.cc index cb985fd024..5015c40109 100644 --- a/onnxruntime/core/providers/cuda/activation/activations.cc +++ b/onnxruntime/core/providers/cuda/activation/activations.cc @@ -38,6 +38,7 @@ namespace cuda { ORT_RETURN_IF_ERROR(UnaryElementwise::Prepare(context, &p)); \ Ctx##x func_ctx = MakeFuncCtx(); \ Impl_##x::MappedType>( \ + Stream(), \ reinterpret_cast::MappedType*>(p.input_tensor->template Data()), \ reinterpret_cast::MappedType*>(p.output_tensor->template MutableData()), \ &func_ctx, p.output_tensor->Shape().Size()); \ diff --git a/onnxruntime/core/providers/cuda/activation/activations_impl.cu b/onnxruntime/core/providers/cuda/activation/activations_impl.cu index bd7a4f7bf8..2ff5a4748f 100644 --- a/onnxruntime/core/providers/cuda/activation/activations_impl.cu +++ b/onnxruntime/core/providers/cuda/activation/activations_impl.cu @@ -84,14 +84,15 @@ struct OP_ThresholdedRelu : public CtxThresholdedRelu { #define UNARY_ACTIVATION_IMPL(name) \ UNARY_ACTIVATION_IMPL_DECLARATION(name) { \ - UnaryElementWiseImpl(input_data, \ + UnaryElementWiseImpl(stream, \ + input_data, \ output_data, \ *reinterpret_cast*>(func_ctx), \ count); \ } #define SPECIALIZED_UNARY_ACTIVATION_IMPL(name, T) \ - template void Impl_##name(const T* input_data, T* output_data, const Ctx##name* func_ctx, size_t count); + template void Impl_##name(cudaStream_t stream, const T* input_data, T* output_data, const Ctx##name* func_ctx, size_t count); #if CUDA_VERSION >= 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) #define SPECIALIZED_UNARY_ACTIVATION_IMPL_BF16(name) SPECIALIZED_UNARY_ACTIVATION_IMPL(name, nv_bfloat16) diff --git a/onnxruntime/core/providers/cuda/activation/activations_impl.h b/onnxruntime/core/providers/cuda/activation/activations_impl.h index a3a39df63b..53359ae7a7 100644 --- a/onnxruntime/core/providers/cuda/activation/activations_impl.h +++ b/onnxruntime/core/providers/cuda/activation/activations_impl.h @@ -48,6 +48,7 @@ typedef CtxAlpha CtxThresholdedRelu; #define UNARY_ACTIVATION_IMPL_DECLARATION(name) \ template \ void Impl_##name( \ + cudaStream_t stream, \ const T* input_data, \ T* output_data, \ const Ctx##name* func_ctx, \ diff --git a/onnxruntime/core/providers/cuda/controlflow/loop.cc b/onnxruntime/core/providers/cuda/controlflow/loop.cc index 12779aca3d..5d430ecfa6 100644 --- a/onnxruntime/core/providers/cuda/controlflow/loop.cc +++ b/onnxruntime/core/providers/cuda/controlflow/loop.cc @@ -51,7 +51,7 @@ ONNX_OPERATOR_KERNEL_EX(Loop, .TypeConstraint("V", DataTypeImpl::AllFixedSizeTensorTypes()), Loop); -static Status ConcatenateGpuOutput(std::vector& per_iteration_output, +static Status ConcatenateGpuOutput(void* stream, std::vector& per_iteration_output, void* output, ptrdiff_t output_size_in_bytes) { const auto& first_output = per_iteration_output.front().Get(); const auto& per_iteration_shape = first_output.Shape(); @@ -68,8 +68,8 @@ static Status ConcatenateGpuOutput(std::vector& per_iteration_output, " Expected:", per_iteration_shape, " Got:", iteration_data.Shape()); } - CUDA_RETURN_IF_ERROR(cudaMemcpy(cur_output, iteration_data.DataRaw(), bytes_per_iteration, - cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(cur_output, iteration_data.DataRaw(), bytes_per_iteration, + cudaMemcpyDeviceToDevice, static_cast(stream))); cur_output = static_cast((static_cast(cur_output) + bytes_per_iteration)); } @@ -82,6 
+82,7 @@ static Status ConcatenateGpuOutput(std::vector& per_iteration_output, Loop::Loop(const OpKernelInfo& info) : onnxruntime::Loop(info) { SetConcatOutputFunc(ConcatenateGpuOutput); + SetComputeStream(static_cast(info.GetExecutionProvider()->GetComputeStream())); } Status Loop::Compute(OpKernelContext* ctx) const { diff --git a/onnxruntime/core/providers/cuda/cu_inc/binary_elementwise_impl.cuh b/onnxruntime/core/providers/cuda/cu_inc/binary_elementwise_impl.cuh index 379b09fe39..069cf0658d 100644 --- a/onnxruntime/core/providers/cuda/cu_inc/binary_elementwise_impl.cuh +++ b/onnxruntime/core/providers/cuda/cu_inc/binary_elementwise_impl.cuh @@ -180,6 +180,7 @@ __global__ void _BinaryElementWiseRhsPerChannelBatchN( template void BinaryElementWiseNoBroadcastImpl( + cudaStream_t stream, const T1* lhs_data, const T2* rhs_data, T* output_data, @@ -190,7 +191,7 @@ void BinaryElementWiseNoBroadcastImpl( int blocksPerGrid = static_cast(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); CUDA_LONG N = static_cast(count); - _BinaryElementWiseSimple<<>>( + _BinaryElementWiseSimple<<>>( lhs_data, rhs_data, output_data, @@ -200,6 +201,7 @@ void BinaryElementWiseNoBroadcastImpl( template void BinaryElementWiseImpl( + cudaStream_t stream, int32_t output_rank_or_simple_broadcast, const TArray* lhs_padded_strides, const T1* lhs_data, @@ -217,14 +219,14 @@ void BinaryElementWiseImpl( int blocksPerGrid = static_cast(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); CUDA_LONG N = static_cast(count); if (output_rank_or_simple_broadcast == static_cast(SimpleBroadcast::NoBroadcast)) { - _BinaryElementWiseSimple<<>>( + _BinaryElementWiseSimple<<>>( lhs_data, rhs_data, output_data, func, N); } else if (output_rank_or_simple_broadcast == static_cast(SimpleBroadcast::LeftScalar)) { - _BinaryElementWiseSimple<<>>( + _BinaryElementWiseSimple<<>>( lhs_data, rhs_data, output_data, @@ -232,14 +234,14 @@ void BinaryElementWiseImpl( N); } else if (output_rank_or_simple_broadcast == static_cast(SimpleBroadcast::RightScalar)) { _BinaryElementWiseSimple<<>>( + GridDim::maxElementsPerThread><<>>( lhs_data, rhs_data, output_data, func, N); } else if (output_rank_or_simple_broadcast == static_cast(SimpleBroadcast::RightPerChannelBatch1)) { - _BinaryElementWiseRhsPerChannelBatch1<<>>( + _BinaryElementWiseRhsPerChannelBatch1<<>>( lhs_data, rhs_data, fdm_H, @@ -247,7 +249,7 @@ void BinaryElementWiseImpl( func, N); } else if (output_rank_or_simple_broadcast == static_cast(SimpleBroadcast::RightPerChannelBatchN)) { - _BinaryElementWiseRhsPerChannelBatchN<<>>( + _BinaryElementWiseRhsPerChannelBatchN<<>>( lhs_data, rhs_data, fdm_H, @@ -257,7 +259,7 @@ void BinaryElementWiseImpl( N); } else { if (lhs_padded_strides && rhs_padded_strides && lhs_padded_strides->Size() && rhs_padded_strides->Size()) - _BinaryElementWise<<>>( + _BinaryElementWise<<>>( output_rank_or_simple_broadcast, *lhs_padded_strides, lhs_data, @@ -268,7 +270,7 @@ void BinaryElementWiseImpl( func, N); else if (lhs_padded_strides && lhs_padded_strides->Size()) - _BinaryElementWise<<>>( + _BinaryElementWise<<>>( output_rank_or_simple_broadcast, *lhs_padded_strides, lhs_data, @@ -279,7 +281,7 @@ void BinaryElementWiseImpl( func, N); else if (rhs_padded_strides && rhs_padded_strides->Size()) - _BinaryElementWise<<>>( + _BinaryElementWise<<>>( output_rank_or_simple_broadcast, TArray(), // lhs is not computed, so no need to deference lhs_padded_strides lhs_data, diff --git 
a/onnxruntime/core/providers/cuda/cu_inc/unary_elementwise_impl.cuh b/onnxruntime/core/providers/cuda/cu_inc/unary_elementwise_impl.cuh index 04d04e2488..66113a1dff 100644 --- a/onnxruntime/core/providers/cuda/cu_inc/unary_elementwise_impl.cuh +++ b/onnxruntime/core/providers/cuda/cu_inc/unary_elementwise_impl.cuh @@ -39,6 +39,7 @@ __global__ void _UnaryElementWise( template void UnaryElementWiseImpl( + cudaStream_t stream, const InT* input_data, OutT* output_data, const FuncT& func, @@ -49,7 +50,7 @@ void UnaryElementWiseImpl( int blocksPerGrid = static_cast(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); CUDA_LONG N = static_cast(count); _UnaryElementWise - <<>>( + <<>>( input_data, output_data, func, diff --git a/onnxruntime/core/providers/cuda/cu_inc/variadic_elementwise_impl.cuh b/onnxruntime/core/providers/cuda/cu_inc/variadic_elementwise_impl.cuh index cd4269542a..e3a3c6a969 100644 --- a/onnxruntime/core/providers/cuda/cu_inc/variadic_elementwise_impl.cuh +++ b/onnxruntime/core/providers/cuda/cu_inc/variadic_elementwise_impl.cuh @@ -62,6 +62,7 @@ __global__ void VariadicElementWiseNoBroadcastInputBatchKernel( // - inputs and output have N elements template void VariadicElementWiseNoBroadcastInputBatchImpl( + cudaStream_t stream, Func func, size_t N, TArray inputs, @@ -70,7 +71,7 @@ void VariadicElementWiseNoBroadcastInputBatchImpl( constexpr int32_t threads_per_block = GridDim::maxThreadsPerBlock; const int32_t blocks_per_grid = static_cast(CeilDiv(N, elements_per_thread * threads_per_block)); VariadicElementWiseNoBroadcastInputBatchKernel - <<>>(func, N, inputs, output); + <<>>(func, N, inputs, output); } } // namespace cuda diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 17cd8c1336..ff542b78cb 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -59,10 +59,15 @@ ONNX_OPERATOR_KERNEL_EX( } // namespace cuda -CUDAExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId device_id, size_t cuda_mem_limit, ArenaExtendStrategy arena_extend_strategy) { +CUDAExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId device_id, cudaStream_t stream, size_t cuda_mem_limit, ArenaExtendStrategy arena_extend_strategy) { CUDA_CALL_THROW(cudaSetDevice(device_id)); + stream_ = stream; + CUBLAS_CALL_THROW(cublasCreate(&cublas_handle_)); + CUBLAS_CALL_THROW(cublasSetStream(cublas_handle_, stream)); + CUDNN_CALL_THROW(cudnnCreate(&cudnn_handle_)); + CUDNN_CALL_THROW(cudnnSetStream(cudnn_handle_, stream)); AllocatorCreationInfo default_memory_info( [](OrtDevice::DeviceId id) { @@ -103,6 +108,12 @@ CUDAExecutionProvider::CUDAExecutionProvider(const CUDAExecutionProviderInfo& in // must wait GPU idle, otherwise cudaGetDeviceProperties might fail CUDA_CALL_THROW(cudaDeviceSynchronize()); CUDA_CALL_THROW(cudaGetDeviceProperties(&device_prop_, info_.device_id)); + if (info.has_user_compute_stream) { + external_stream_ = true; + stream_ = static_cast(info.user_compute_stream); + } else { + CUDA_CALL_THROW(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)); + } size_t free = 0; size_t total = 0; @@ -136,6 +147,10 @@ CUDAExecutionProvider::~CUDAExecutionProvider() { ORT_IGNORE_RETURN_VALUE(cache->erase(this)); } } + + if (!external_stream_ && stream_) { + CUDA_CALL(cudaStreamDestroy(stream_)); + } } CUDAExecutionProvider::PerThreadContext& 
CUDAExecutionProvider::GetPerThreadContext() const { @@ -156,7 +171,7 @@ CUDAExecutionProvider::PerThreadContext& CUDAExecutionProvider::GetPerThreadCont // get or create a context if (context_state_.retired_context_pool.empty()) { - context = std::make_shared(info_.device_id, info_.cuda_mem_limit, info_.arena_extend_strategy); + context = std::make_shared(info_.device_id, static_cast(GetComputeStream()), info_.cuda_mem_limit, info_.arena_extend_strategy); } else { context = context_state_.retired_context_pool.back(); context_state_.retired_context_pool.pop_back(); @@ -254,10 +269,24 @@ Status CUDAExecutionProvider::OnRunStart() { Status CUDAExecutionProvider::OnRunEnd() { // record deferred release event on default stream, and release per_thread_context auto current_deferred_release_event = GetPerThreadContext().GetCurrentDeferredReleaseEvent(); - CUDA_RETURN_IF_ERROR(cudaEventRecord(current_deferred_release_event, nullptr)); + CUDA_RETURN_IF_ERROR(cudaEventRecord(current_deferred_release_event, static_cast(GetComputeStream()))); + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(static_cast(GetComputeStream()))); ReleasePerThreadContext(); std::lock_guard lock(deferred_release_cpu_ptr_mutex_); deferred_release_cpu_ptr_[current_deferred_release_event].recorded = true; + + return Status::OK(); +} + +Status CUDAExecutionProvider::SetComputeStream(void* stream) { + if (stream != stream_) { + if (stream_) { + CUDA_RETURN_IF_ERROR(cudaStreamDestroy(stream_)); + } + + external_stream_ = true; + stream_ = static_cast(stream); + } return Status::OK(); } @@ -1878,7 +1907,7 @@ static bool CastNeedFallbackToCPU(const onnxruntime::Node& node) { } std::unique_ptr CUDAExecutionProvider::GetDataTransfer() const { - return onnxruntime::make_unique(info_.do_copy_in_default_stream); + return onnxruntime::make_unique(static_cast(GetComputeStream()), info_.do_copy_in_default_stream); } std::vector> diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.h b/onnxruntime/core/providers/cuda/cuda_execution_provider.h index 2f55ee6bbb..f44a341e30 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.h +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.h @@ -13,8 +13,8 @@ #include "core/platform/ort_mutex.h" #include "core/providers/cuda/cuda_execution_provider_info.h" #include "core/providers/cuda/cuda_pch.h" -#include "core/providers/cuda/gpu_data_transfer.h" #include "core/providers/cuda/shared_inc/cuda_utils.h" +#include "core/providers/cuda/shared_inc/cuda_call.h" namespace onnxruntime { @@ -37,6 +37,10 @@ class CUDAExecutionProvider : public IExecutionProvider { return nullptr; } + Status SetComputeStream(void* stream) override; + + void* GetComputeStream() const override { return static_cast(stream_); } + cublasHandle_t PerThreadCublasHandle() { return GetPerThreadContext().CublasHandle(); } @@ -80,6 +84,8 @@ class CUDAExecutionProvider : public IExecutionProvider { private: CUDAExecutionProviderInfo info_; cudaDeviceProp device_prop_; + bool external_stream_ = false; + cudaStream_t stream_ = nullptr; struct DeferredReleaseCPUPtrs { bool recorded = false; std::vector cpu_ptrs; @@ -90,7 +96,7 @@ class CUDAExecutionProvider : public IExecutionProvider { class PerThreadContext final { public: - PerThreadContext(OrtDevice::DeviceId device_id, size_t cuda_mem_limit, ArenaExtendStrategy arena_extend_strategy); + PerThreadContext(OrtDevice::DeviceId device_id, cudaStream_t stream, size_t cuda_mem_limit, ArenaExtendStrategy arena_extend_strategy); ~PerThreadContext(); 
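For context, a minimal usage sketch (not part of this patch): with the new has_user_compute_stream / user_compute_stream fields, an application can hand its own cudaStream_t to the CUDA execution provider instead of letting the provider create one, and the provider only destroys streams it created itself. Everything below — the stream variable, the model path, and the surrounding main() — is illustrative, not code from this change.

// Sketch: supplying an application-owned CUDA stream to the CUDA EP.
// Option values other than the stream follow the documented defaults.
#include <cuda_runtime.h>
#include <limits>
#include "onnxruntime_cxx_api.h"

int main() {
  cudaStream_t app_stream = nullptr;
  cudaStreamCreateWithFlags(&app_stream, cudaStreamNonBlocking);

  OrtCUDAProviderOptions cuda_options{};
  cuda_options.device_id = 0;
  cuda_options.cuda_mem_limit = std::numeric_limits<size_t>::max();
  cuda_options.arena_extend_strategy = 0;         // kNextPowerOfTwo
  cuda_options.do_copy_in_default_stream = 1;
  cuda_options.has_user_compute_stream = 1;       // new in this change
  cuda_options.user_compute_stream = app_stream;  // provider will not destroy it

  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "user-stream-demo");
  Ort::SessionOptions session_options;
  session_options.AppendExecutionProvider_CUDA(cuda_options);
  Ort::Session session(env, "model.onnx", session_options);

  // ... run the session; kernels and copies are issued on app_stream ...
  cudaStreamSynchronize(app_stream);
  cudaStreamDestroy(app_stream);
  return 0;
}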
cublasHandle_t CublasHandle() const { @@ -111,23 +117,23 @@ class CUDAExecutionProvider : public IExecutionProvider { if (!constant_ones_float_) { constant_ones_float_ = cuda::CreateConstantOnes(); } - return reinterpret_cast(constant_ones_float_->GetBuffer(count)); + return reinterpret_cast(constant_ones_float_->GetBuffer(stream_, count)); } else if (std::is_same::value) { if (!constant_ones_double_) { constant_ones_double_ = cuda::CreateConstantOnes(); } - return reinterpret_cast(constant_ones_double_->GetBuffer(count)); + return reinterpret_cast(constant_ones_double_->GetBuffer(stream_, count)); } else if (std::is_same::value) { if (!constant_ones_half_) { constant_ones_half_ = cuda::CreateConstantOnes(); } - return reinterpret_cast(constant_ones_half_->GetBuffer(count)); + return reinterpret_cast(constant_ones_half_->GetBuffer(stream_, count)); #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 } else if (std::is_same::value) { if (!constant_ones_bfloat16_) { constant_ones_bfloat16_ = cuda::CreateConstantOnes(); } - return reinterpret_cast(constant_ones_bfloat16_->GetBuffer(count)); + return reinterpret_cast(constant_ones_bfloat16_->GetBuffer(stream_, count)); #endif } else { return nullptr; @@ -139,6 +145,7 @@ class CUDAExecutionProvider : public IExecutionProvider { } private: + cudaStream_t stream_ = nullptr; cublasHandle_t cublas_handle_ = nullptr; cudnnHandle_t cudnn_handle_ = nullptr; diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h b/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h index 5ba2d07b9c..0b1e7bfe86 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h @@ -18,6 +18,8 @@ struct CUDAExecutionProviderInfo { ArenaExtendStrategy arena_extend_strategy{ArenaExtendStrategy::kNextPowerOfTwo}; OrtCudnnConvAlgoSearch cudnn_conv_algo_search{OrtCudnnConvAlgoSearch::EXHAUSTIVE}; bool do_copy_in_default_stream{true}; + bool has_user_compute_stream{false}; + void* user_compute_stream{nullptr}; static CUDAExecutionProviderInfo FromProviderOptions(const ProviderOptions& options); static ProviderOptions ToProviderOptions(const CUDAExecutionProviderInfo& info); diff --git a/onnxruntime/core/providers/cuda/cuda_kernel.h b/onnxruntime/core/providers/cuda/cuda_kernel.h index 12fccc9f19..8dd0a5b781 100644 --- a/onnxruntime/core/providers/cuda/cuda_kernel.h +++ b/onnxruntime/core/providers/cuda/cuda_kernel.h @@ -59,7 +59,9 @@ class CudaKernel : public OpKernel { provider_->AddDeferredReleaseCPUPtr(p); } - const cudaDeviceProp& GetDeviceProp() const { return provider_->GetDeviceProp(); }; + const cudaDeviceProp& GetDeviceProp() const { return provider_->GetDeviceProp(); } + + inline cudaStream_t Stream() const { return static_cast(provider_->GetComputeStream()); } // To support cudaMemcpyAsync, the cpu memory should be allocated in pinned memory // and it can only be released after the copy has finished @@ -94,7 +96,7 @@ class CudaKernel : public OpKernel { Status CopyToGpu() { if (cpu_pinned_copy_) { gpu_copy_ = op_kernel_->GetScratchBuffer(count_); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(gpu_copy_.get(), cpu_pinned_copy_.get(), count_ * sizeof(T), cudaMemcpyHostToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(gpu_copy_.get(), cpu_pinned_copy_.get(), count_ * sizeof(T), cudaMemcpyHostToDevice, op_kernel_->Stream())); op_kernel_->AddDeferredReleaseCPUPtr(cpu_pinned_copy_.release()); } return Status::OK(); diff --git 
a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc index ef3c7a4269..6a5e8fd8a0 100644 --- a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc +++ b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc @@ -57,7 +57,8 @@ ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider_CUDA, info.arena_extend_strategy = static_cast(cuda_options->arena_extend_strategy); info.cudnn_conv_algo_search = cuda_options->cudnn_conv_algo_search; info.do_copy_in_default_stream = cuda_options->do_copy_in_default_stream; - + info.has_user_compute_stream = cuda_options->has_user_compute_stream; + info.user_compute_stream = cuda_options->user_compute_stream; options->provider_factories.push_back(onnxruntime::CreateExecutionProviderFactory_CUDA(info)); return nullptr; diff --git a/onnxruntime/core/providers/cuda/cuda_utils.cu b/onnxruntime/core/providers/cuda/cuda_utils.cu index a0ee56c6a3..c9cf75ef29 100644 --- a/onnxruntime/core/providers/cuda/cuda_utils.cu +++ b/onnxruntime/core/providers/cuda/cuda_utils.cu @@ -27,11 +27,11 @@ __global__ void _Fill( } template -void Fill(T* output, T value, int64_t count) { +void Fill(cudaStream_t stream, T* output, T value, int64_t count) { int blocksPerGrid = static_cast(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); CUDA_LONG N = static_cast(count); _Fill - <<>>(output, value, N); + <<>>(output, value, N); } template class ConstantBufferImpl : public IConstantBuffer { @@ -43,7 +43,7 @@ class ConstantBufferImpl : public IConstantBuffer { cudaFree(buffer_); } - virtual const T* GetBuffer(size_t count) { + virtual const T* GetBuffer(cudaStream_t stream, size_t count) { if (count > count_) { if (buffer_) { cudaFree(buffer_); @@ -52,7 +52,7 @@ class ConstantBufferImpl : public IConstantBuffer { CUDA_CALL_THROW(cudaMalloc(&buffer_, count * sizeof(T))); count_ = count; - Fill(buffer_, val_, count); + Fill(stream, buffer_, val_, count); } return buffer_; } @@ -76,7 +76,7 @@ template std::unique_ptr> CreateConstantOnes(T * output, T value, int64_t count); + template void Fill(cudaStream_t stream, T * output, T value, int64_t count); SPECIALIZED_FILL(int8_t) SPECIALIZED_FILL(int16_t) diff --git a/onnxruntime/core/providers/cuda/fpgeneric.cu b/onnxruntime/core/providers/cuda/fpgeneric.cu index 8f03004d89..695c6038c0 100644 --- a/onnxruntime/core/providers/cuda/fpgeneric.cu +++ b/onnxruntime/core/providers/cuda/fpgeneric.cu @@ -65,30 +65,30 @@ __global__ void CopyVectorBFloat16(const nv_bfloat16* x, int incx, nv_bfloat16* } // namespace -cublasStatus_t cublasTransposeHelper(cublasHandle_t, cublasOperation_t, cublasOperation_t, int m, int n, const half*, const half* A, int, const half*, const half*, int, half* C, int) { +cublasStatus_t cublasTransposeHelper(cudaStream_t stream, cublasHandle_t, cublasOperation_t, cublasOperation_t, int m, int n, const half*, const half* A, int, const half*, const half*, int, half* C, int) { if (C != A) { dim3 dimGrid((n + TRANS_TILE_DIM - 1) / TRANS_TILE_DIM, (m + TRANS_TILE_DIM - 1) / TRANS_TILE_DIM, 1); dim3 dimBlock(TRANS_TILE_DIM, BLOCK_ROWS, 1); - transposeNoOverlap<<>>(C, A, n, m); + transposeNoOverlap<<>>(C, A, n, m); } else { return CUBLAS_STATUS_NOT_SUPPORTED; } return CUBLAS_STATUS_SUCCESS; } -cublasStatus_t cublasCopyHelper(cublasHandle_t, int n, const half* x, int incx, half* y, int incy) { +cublasStatus_t cublasCopyHelper(cudaStream_t stream, cublasHandle_t, int n, const half* x, int incx, half* y, int incy) { dim3 
dimGrid((unsigned int)(n + COPY_BLOCK_DIM - 1) / COPY_BLOCK_DIM, 1, 1); dim3 dimBlock(COPY_BLOCK_DIM, 1, 1); - CopyVectorHalf<<>>(x, incx, y, incy, n); + CopyVectorHalf<<>>(x, incx, y, incy, n); return CUBLAS_STATUS_SUCCESS; } #if CUDA_VERSION >= 11000 -cublasStatus_t cublasCopyHelper(cublasHandle_t, int n, const nv_bfloat16* x, int incx, nv_bfloat16* y, int incy) { +cublasStatus_t cublasCopyHelper(cudaStream_t stream, cublasHandle_t, int n, const nv_bfloat16* x, int incx, nv_bfloat16* y, int incy) { dim3 dimGrid((unsigned int)(n + COPY_BLOCK_DIM - 1) / COPY_BLOCK_DIM, 1, 1); dim3 dimBlock(COPY_BLOCK_DIM, 1, 1); - CopyVectorBFloat16<<>>(x, incx, y, incy, n); + CopyVectorBFloat16<<>>(x, incx, y, incy, n); return CUBLAS_STATUS_SUCCESS; } diff --git a/onnxruntime/core/providers/cuda/generator/constant_of_shape.cc b/onnxruntime/core/providers/cuda/generator/constant_of_shape.cc index f598a0337b..052fdf5dfe 100644 --- a/onnxruntime/core/providers/cuda/generator/constant_of_shape.cc +++ b/onnxruntime/core/providers/cuda/generator/constant_of_shape.cc @@ -32,7 +32,7 @@ Status ConstantOfShape::ComputeInternal(OpKernelContext* ctx) const { #define CASE(TYPE) \ case sizeof(TYPE): \ if (size > 0) { \ - cuda::Fill(reinterpret_cast(output_data), *(reinterpret_cast(value_ptr)), size); \ + cuda::Fill(Stream(), reinterpret_cast(output_data), *(reinterpret_cast(value_ptr)), size); \ } \ break; diff --git a/onnxruntime/core/providers/cuda/generator/range.cc b/onnxruntime/core/providers/cuda/generator/range.cc index cc57d9d2a5..bc9c94c120 100644 --- a/onnxruntime/core/providers/cuda/generator/range.cc +++ b/onnxruntime/core/providers/cuda/generator/range.cc @@ -30,7 +30,7 @@ ONNX_OPERATOR_KERNEL_EX( Range); template -static Status ComputeRange(OpKernelContext* ctx) { +static Status ComputeRange(cudaStream_t stream, OpKernelContext* ctx) { const auto& start_tensor = *ctx->Input(0); const auto& limit_tensor = *ctx->Input(1); const auto* delta_tensor_ptr = ctx->Input(2); @@ -71,7 +71,7 @@ static Status ComputeRange(OpKernelContext* ctx) { T* y = ctx->Output(0, shape)->template MutableData(); if (count > 0) { - if (!RangeImpl(start, delta, count, y)) { + if (!RangeImpl(stream, start, delta, count, y)) { CUDA_CALL(cudaGetLastError()); return Status(common::ONNXRUNTIME, common::FAIL); } @@ -84,8 +84,8 @@ namespace cuda_range_internal { template struct CallCudaRangeImpl { - Status operator()(OpKernelContext* ctx) const { - return ComputeRange(ctx); + Status operator()(cudaStream_t stream, OpKernelContext* ctx) const { + return ComputeRange(stream, ctx); } }; @@ -100,7 +100,7 @@ Status Range::ComputeInternal(OpKernelContext* ctx) const { utils::MLTypeCallDispatcherRet t_disp(input_tensor->GetElementType()); - return t_disp.Invoke(ctx); + return t_disp.Invoke(Stream(), ctx); } } // namespace cuda diff --git a/onnxruntime/core/providers/cuda/generator/range_impl.cu b/onnxruntime/core/providers/cuda/generator/range_impl.cu index 4756350392..ede8146478 100644 --- a/onnxruntime/core/providers/cuda/generator/range_impl.cu +++ b/onnxruntime/core/providers/cuda/generator/range_impl.cu @@ -22,15 +22,15 @@ __global__ void RangeKernel(const T start, const T delta, const int count, T* ou } template -bool RangeImpl(const T start, const T delta, const int count, T* output) { +bool RangeImpl(cudaStream_t stream, const T start, const T delta, const int count, T* output) { constexpr int block_size = 256; int grid_size = (count + block_size - 1) / block_size; - RangeKernel<<>>(start, delta, count, output); + RangeKernel<<>>(start, 
delta, count, output); return CUDA_CALL(cudaPeekAtLastError()); } #define SPECIALIZED_IMPL(T) \ - template bool RangeImpl(const T start, const T delta, const int count, T* output); + template bool RangeImpl(cudaStream_t stream, const T start, const T delta, const int count, T* output); SPECIALIZED_IMPL(int16_t) SPECIALIZED_IMPL(int32_t) diff --git a/onnxruntime/core/providers/cuda/generator/range_impl.h b/onnxruntime/core/providers/cuda/generator/range_impl.h index 684978d544..608c65223a 100644 --- a/onnxruntime/core/providers/cuda/generator/range_impl.h +++ b/onnxruntime/core/providers/cuda/generator/range_impl.h @@ -9,7 +9,7 @@ namespace cuda { template -bool RangeImpl(const T start, const T delta, const int count, T* output); +bool RangeImpl(cudaStream_t stream, const T start, const T delta, const int count, T* output); } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/gpu_data_transfer.cc b/onnxruntime/core/providers/cuda/gpu_data_transfer.cc index 6618688087..cd83b3d612 100644 --- a/onnxruntime/core/providers/cuda/gpu_data_transfer.cc +++ b/onnxruntime/core/providers/cuda/gpu_data_transfer.cc @@ -9,12 +9,13 @@ // so we leave it as optional, in case user need the previous behavior // a full fix to BFC arena is being looked at, and once it's in, we can revert this change namespace onnxruntime { -GPUDataTransfer::GPUDataTransfer(bool do_copy_in_default_stream) { +GPUDataTransfer::GPUDataTransfer(cudaStream_t stream, bool do_copy_in_default_stream) { // create streams, default is nullptr - streams_[kCudaStreamDefault] = nullptr; + do_copy_in_default_stream_ = do_copy_in_default_stream; + streams_[kCudaStreamDefault] = stream; if (do_copy_in_default_stream) { - streams_[kCudaStreamCopyIn] = nullptr; - streams_[kCudaStreamCopyOut] = nullptr; + streams_[kCudaStreamCopyIn] = stream; + streams_[kCudaStreamCopyOut] = stream; } else { CUDA_CALL_THROW(cudaStreamCreateWithFlags(&streams_[kCudaStreamCopyIn], cudaStreamNonBlocking)); CUDA_CALL_THROW(cudaStreamCreateWithFlags(&streams_[kCudaStreamCopyOut], cudaStreamNonBlocking)); @@ -22,10 +23,10 @@ GPUDataTransfer::GPUDataTransfer(bool do_copy_in_default_stream) { } GPUDataTransfer::~GPUDataTransfer() { - if (streams_[kCudaStreamCopyIn] != nullptr) { + if (!do_copy_in_default_stream_ && streams_[kCudaStreamCopyIn] != nullptr) { CUDA_CALL(cudaStreamDestroy(streams_[kCudaStreamCopyIn])); } - if (streams_[kCudaStreamCopyOut] != nullptr) { + if (!do_copy_in_default_stream_ && streams_[kCudaStreamCopyOut] != nullptr) { CUDA_CALL(cudaStreamDestroy(streams_[kCudaStreamCopyOut])); } } @@ -46,24 +47,26 @@ common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int e if (dst_device.Type() == OrtDevice::GPU) { if (src_device.Type() == OrtDevice::CPU && src_device.MemType() == OrtDevice::MemType::CUDA_PINNED) { // copy from pinned memory to GPU, this is non-blocking - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyHostToDevice, streams_[exec_queue_id])); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyHostToDevice, GetStream(exec_queue_id))); } else if (src_device.Type() == OrtDevice::GPU) { // copying between GPU, this is non-blocking // Copy only if the two addresses are different. 
if (dst_data != src_data) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToDevice, streams_[kCudaStreamDefault])); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToDevice, GetStream(kCudaStreamDefault))); } } else { // copy from other CPU memory to GPU, this is blocking - CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyHostToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyHostToDevice, GetStream(kCudaStreamDefault))); + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(GetStream(kCudaStreamDefault))); } } else if (src_device.Type() == OrtDevice::GPU) { if (dst_device.Type() == OrtDevice::CPU && dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED) { // copying from GPU to pinned memory, this is non-blocking - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToHost, streams_[exec_queue_id])); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToHost, GetStream(exec_queue_id))); } else { // copying from GPU to CPU memory, this is blocking - CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyDeviceToHost)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToHost, GetStream(kCudaStreamDefault))); + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(GetStream(kCudaStreamDefault))); } } else { // copying between cpu memory diff --git a/onnxruntime/core/providers/cuda/gpu_data_transfer.h b/onnxruntime/core/providers/cuda/gpu_data_transfer.h index 055e2a90fd..f8eeb5fa97 100644 --- a/onnxruntime/core/providers/cuda/gpu_data_transfer.h +++ b/onnxruntime/core/providers/cuda/gpu_data_transfer.h @@ -17,7 +17,7 @@ enum CUDAStreamType : int { class GPUDataTransfer : public IDataTransfer { public: - GPUDataTransfer(bool do_copy_in_default_stream = true); + GPUDataTransfer(cudaStream_t stream, bool do_copy_in_default_stream = true); ~GPUDataTransfer(); bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override; @@ -32,6 +32,7 @@ class GPUDataTransfer : public IDataTransfer { } private: + bool do_copy_in_default_stream_; cudaStream_t streams_[kTotalCudaStreams]; }; diff --git a/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.cc b/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.cc index c733fb85df..52bf4c0a41 100644 --- a/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.cc +++ b/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.cc @@ -145,6 +145,7 @@ Status BinaryElementwise::Prepare(OpKernelContext* context, Bin BinaryElementwisePreparation prepare; \ ORT_RETURN_IF_ERROR(Prepare(context, &prepare)); \ Impl_##x::MappedType>( \ + Stream(), \ prepare.output_rank_or_simple_broadcast, \ &prepare.lhs_padded_strides, \ reinterpret_cast::MappedType*>(prepare.lhs_tensor->template Data()), \ @@ -315,12 +316,13 @@ ONNX_OPERATOR_KERNEL_EX( namespace pow12_internal { template -Status DispatchOnFirstArg(const BinaryElementwisePreparation& prepare) { +Status DispatchOnFirstArg(cudaStream_t stream, const BinaryElementwisePreparation& prepare) { namespace on = ONNX_NAMESPACE; Status s; switch (prepare.rhs_tensor->GetElementType()) { case on::TensorProto_DataType_INT32: ImplT1_Pow::MappedType, typename ToCudaType::MappedType>( + stream, prepare.output_rank_or_simple_broadcast, &prepare.lhs_padded_strides, reinterpret_cast::MappedType*>(prepare.lhs_tensor->template Data()), @@ -334,6 +336,7 @@ Status DispatchOnFirstArg(const 
BinaryElementwisePreparation& prepare) { break; case on::TensorProto_DataType_INT64: ImplT1_Pow::MappedType, typename ToCudaType::MappedType>( + stream, prepare.output_rank_or_simple_broadcast, &prepare.lhs_padded_strides, reinterpret_cast::MappedType*>(prepare.lhs_tensor->template Data()), @@ -347,6 +350,7 @@ Status DispatchOnFirstArg(const BinaryElementwisePreparation& prepare) { break; case on::TensorProto_DataType_FLOAT: ImplT1_Pow::MappedType, typename ToCudaType::MappedType>( + stream, prepare.output_rank_or_simple_broadcast, &prepare.lhs_padded_strides, reinterpret_cast::MappedType*>(prepare.lhs_tensor->template Data()), @@ -360,6 +364,7 @@ Status DispatchOnFirstArg(const BinaryElementwisePreparation& prepare) { break; case on::TensorProto_DataType_DOUBLE: ImplT1_Pow::MappedType, typename ToCudaType::MappedType>( + stream, prepare.output_rank_or_simple_broadcast, &prepare.lhs_padded_strides, reinterpret_cast::MappedType*>(prepare.lhs_tensor->template Data()), @@ -373,6 +378,7 @@ Status DispatchOnFirstArg(const BinaryElementwisePreparation& prepare) { break; case on::TensorProto_DataType_FLOAT16: ImplT1_Pow::MappedType, typename ToCudaType::MappedType>( + stream, prepare.output_rank_or_simple_broadcast, &prepare.lhs_padded_strides, reinterpret_cast::MappedType*>(prepare.lhs_tensor->template Data()), @@ -402,19 +408,19 @@ Status Pow::ComputeInternal(OpKernelContext* context) const { switch (prepare.lhs_tensor->GetElementType()) { case on::TensorProto_DataType_INT32: - s = DispatchOnFirstArg(prepare); + s = DispatchOnFirstArg(Stream(), prepare); break; case on::TensorProto_DataType_INT64: - s = DispatchOnFirstArg(prepare); + s = DispatchOnFirstArg(Stream(), prepare); break; case on::TensorProto_DataType_FLOAT: - s = DispatchOnFirstArg(prepare); + s = DispatchOnFirstArg(Stream(), prepare); break; case on::TensorProto_DataType_DOUBLE: - s = DispatchOnFirstArg(prepare); + s = DispatchOnFirstArg(Stream(), prepare); break; case on::TensorProto_DataType_FLOAT16: - s = DispatchOnFirstArg(prepare); + s = DispatchOnFirstArg(Stream(), prepare); break; default: s = ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported X type: ", @@ -431,6 +437,7 @@ Status CompareFunction::CompareMethod(OpKernelContext* context, ImplCo ORT_RETURN_IF_ERROR(Prepare(context, &prepare)); Impl_Compare( + Stream(), prepare.output_rank_or_simple_broadcast, &prepare.lhs_padded_strides, reinterpret_cast(prepare.lhs_tensor->template Data()), diff --git a/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.h b/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.h index 4651004b91..f4c1675aaf 100644 --- a/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.h +++ b/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.h @@ -219,7 +219,8 @@ class CompareFunction : public BinaryElementwise { public: CompareFunction(const OpKernelInfo& info) : BinaryElementwise(info) {} - typedef void (*ImplCompare)(int32_t output_rank_or_simple_broadcast, + typedef void (*ImplCompare)(cudaStream_t stream, + int32_t output_rank_or_simple_broadcast, const TArray* lhs_padded_strides, const CudaT* lhs_data, const TArray* rhs_padded_strides, diff --git a/onnxruntime/core/providers/cuda/math/binary_elementwise_ops_impl.cu b/onnxruntime/core/providers/cuda/math/binary_elementwise_ops_impl.cu index f0cb62faaa..8dc09b7fbc 100644 --- a/onnxruntime/core/providers/cuda/math/binary_elementwise_ops_impl.cu +++ b/onnxruntime/core/providers/cuda/math/binary_elementwise_ops_impl.cu @@ -12,7 +12,8 @@ namespace cuda { #define 
BINARY_ELEMENTWISE_IMPL(name) \ BINARY_ELEMENTWISE_IMPL_DECLARATION(name) { \ - BinaryElementWiseImpl(output_rank_or_simple_broadcast, \ + BinaryElementWiseImpl(stream, \ + output_rank_or_simple_broadcast, \ lhs_padded_strides, \ lhs_data, \ rhs_padded_strides, \ @@ -27,7 +28,8 @@ namespace cuda { #define BINARY_ELEMENTWISE_IMPL_T1(name) \ BINARY_ELEMENTWISE_IMPL_DECLARATION_T1(name) { \ - BinaryElementWiseImpl(output_rank_or_simple_broadcast, \ + BinaryElementWiseImpl(stream, \ + output_rank_or_simple_broadcast, \ lhs_padded_strides, \ lhs_data, \ rhs_padded_strides, \ @@ -42,7 +44,8 @@ namespace cuda { #define BINARY_ELEMENTWISE_IMPL_T2(name) \ BINARY_ELEMENTWISE_IMPL_DECLARATION_T2(name) { \ - BinaryElementWiseImpl(output_rank_or_simple_broadcast, \ + BinaryElementWiseImpl(stream, \ + output_rank_or_simple_broadcast, \ lhs_padded_strides, \ lhs_data, \ rhs_padded_strides, \ @@ -56,19 +59,22 @@ namespace cuda { } #define SPECIALIZED_BINARY_ELEMENTWISE_IMPL(x, T) \ - template void Impl_##x(int32_t output_rank, \ + template void Impl_##x(cudaStream_t stream, \ + int32_t output_rank, \ const TArray* lhs_padded_strides, const T* lhs_data, \ const TArray* rhs_padded_strides, const T* rhs_data, \ const TArray* fdm_output_strides, const fast_divmod& fdm_H, const fast_divmod& fdm_C, T* output_data, size_t count); #define SPECIALIZED_BINARY_ELEMENTWISE_IMPL_T1(x, T, T1) \ - template void ImplT1_##x(int32_t output_rank, \ + template void ImplT1_##x(cudaStream_t stream, \ + int32_t output_rank, \ const TArray* lhs_padded_strides, const T* lhs_data, \ const TArray* rhs_padded_strides, const T1* rhs_data, \ const TArray* fdm_output_strides, const fast_divmod& fdm_H, const fast_divmod& fdm_C, T* output_data, size_t count); #define SPECIALIZED_BINARY_ELEMENTWISE_IMPL_T2(x, T, T1, T2) \ - template void ImplT2_##x(int32_t output_rank, \ + template void ImplT2_##x(cudaStream_t stream, \ + int32_t output_rank, \ const TArray* lhs_padded_strides, const T1* lhs_data, \ const TArray* rhs_padded_strides, const T2* rhs_data, \ const TArray* fdm_output_strides, const fast_divmod& fdm_H, const fast_divmod& fdm_C, T* output_data, size_t count); diff --git a/onnxruntime/core/providers/cuda/math/binary_elementwise_ops_impl.h b/onnxruntime/core/providers/cuda/math/binary_elementwise_ops_impl.h index dbc7e89a03..c9a8c0f1d3 100644 --- a/onnxruntime/core/providers/cuda/math/binary_elementwise_ops_impl.h +++ b/onnxruntime/core/providers/cuda/math/binary_elementwise_ops_impl.h @@ -34,6 +34,7 @@ namespace cuda { #define BINARY_ELEMENTWISE_IMPL_DECLARATION(name) \ template \ void Impl_##name( \ + cudaStream_t stream, \ int32_t output_rank_or_simple_broadcast, \ const TArray* lhs_padded_strides, \ const T* lhs_data, \ @@ -52,6 +53,7 @@ BINARY_OPS() #define BINARY_ELEMENTWISE_IMPL_DECLARATION_T1(name) \ template \ void ImplT1_##name( \ + cudaStream_t stream, \ int32_t output_rank_or_simple_broadcast, \ const TArray* lhs_padded_strides, \ const T* lhs_data, \ @@ -68,6 +70,7 @@ BINARY_ELEMENTWISE_IMPL_DECLARATION_T1(Pow); #define BINARY_ELEMENTWISE_IMPL_DECLARATION_T2(name) \ template \ void ImplT2_##name( \ + cudaStream_t stream, \ int32_t output_rank_or_simple_broadcast, \ const TArray* lhs_padded_strides, \ const T1* lhs_data, \ diff --git a/onnxruntime/core/providers/cuda/math/clip.cc b/onnxruntime/core/providers/cuda/math/clip.cc index 6189f703f2..c6d41f4cb7 100644 --- a/onnxruntime/core/providers/cuda/math/clip.cc +++ b/onnxruntime/core/providers/cuda/math/clip.cc @@ -62,7 +62,7 @@ Status 
Clip_6::ComputeInternal(OpKernelContext* ctx) const { if (count > 0) { auto* y_data = Y->template MutableData(); const auto* x_data = X.template Data(); - ClipImpl(x_data, y_data, this->min_, this->max_, count); + ClipImpl(Stream(), x_data, y_data, this->min_, this->max_, count); } return Status::OK(); } @@ -91,7 +91,7 @@ struct LowMax { template struct Clip::ComputeImpl { - void operator()(const Tensor* X, const Tensor* min, const Tensor* max, Tensor* Y) const { + void operator()(cudaStream_t stream, const Tensor* X, const Tensor* min, const Tensor* max, Tensor* Y) const { auto min_val = clip_internal::LowMax::low(); auto max_val = clip_internal::LowMax::max(); @@ -110,7 +110,7 @@ struct Clip::ComputeImpl { if (count > 0) { auto* y_data = Y->template MutableData(); const auto* x_data = X->template Data(); - ClipImpl(x_data, y_data, min_val, max_val, count); + ClipImpl(stream, x_data, y_data, min_val, max_val, count); } } }; @@ -124,7 +124,7 @@ Status Clip::ComputeInternal(OpKernelContext* ctx) const { utils::MLTypeCallDispatcher t_disp(X->GetElementType()); - t_disp.Invoke(X, min, max, Y); + t_disp.Invoke(Stream(), X, min, max, Y); return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/math/clip_impl.cu b/onnxruntime/core/providers/cuda/math/clip_impl.cu index 3af9283bd8..4164038972 100644 --- a/onnxruntime/core/providers/cuda/math/clip_impl.cu +++ b/onnxruntime/core/providers/cuda/math/clip_impl.cu @@ -13,24 +13,24 @@ __global__ void _Clip(const T* input, T* output, T min, T max, size_t N) { } template -void ClipImpl(const T* input_data, T* output_data, T min, T max, size_t count) { +void ClipImpl(cudaStream_t stream, const T* input_data, T* output_data, T min, T max, size_t count) { typedef typename ToCudaType::MappedType CudaT; int blocksPerGrid = (int)(ceil(static_cast(count) / GridDim::maxThreadsPerBlock)); - _Clip<<>>(reinterpret_cast(input_data), + _Clip<<>>(reinterpret_cast(input_data), reinterpret_cast(output_data), *reinterpret_cast(&min), *reinterpret_cast(&max), count); } -template void ClipImpl(const float* input_data, float* output_data, float min, float max, size_t count); -template void ClipImpl(const double* input_data, double* output_data, double min, double max, size_t count); -template void ClipImpl(const MLFloat16* input_data, MLFloat16* output_data, MLFloat16 min, MLFloat16 max, size_t count); -template void ClipImpl(const int8_t* input_data, int8_t* output_data, int8_t min, int8_t max, size_t count); -template void ClipImpl(const uint8_t* input_data, uint8_t* output_data, uint8_t min, uint8_t max, size_t count); -template void ClipImpl(const int64_t* input_data, int64_t* output_data, int64_t min, int64_t max, size_t count); -template void ClipImpl(const uint64_t* input_data, uint64_t* output_data, uint64_t min, uint64_t max, size_t count); +template void ClipImpl(cudaStream_t stream, const float* input_data, float* output_data, float min, float max, size_t count); +template void ClipImpl(cudaStream_t stream, const double* input_data, double* output_data, double min, double max, size_t count); +template void ClipImpl(cudaStream_t stream, const MLFloat16* input_data, MLFloat16* output_data, MLFloat16 min, MLFloat16 max, size_t count); +template void ClipImpl(cudaStream_t stream, const int8_t* input_data, int8_t* output_data, int8_t min, int8_t max, size_t count); +template void ClipImpl(cudaStream_t stream, const uint8_t* input_data, uint8_t* output_data, uint8_t min, uint8_t max, size_t count); +template void ClipImpl(cudaStream_t stream, const int64_t* 
input_data, int64_t* output_data, int64_t min, int64_t max, size_t count); +template void ClipImpl(cudaStream_t stream, const uint64_t* input_data, uint64_t* output_data, uint64_t min, uint64_t max, size_t count); } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/math/clip_impl.h b/onnxruntime/core/providers/cuda/math/clip_impl.h index b497a44888..6aefa7b90b 100644 --- a/onnxruntime/core/providers/cuda/math/clip_impl.h +++ b/onnxruntime/core/providers/cuda/math/clip_impl.h @@ -10,7 +10,7 @@ namespace onnxruntime { namespace cuda { template -void ClipImpl(const T* input_data, T* output_data, T min, T max, size_t count); +void ClipImpl(cudaStream_t stream, const T* input_data, T* output_data, T min, T max, size_t count); } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/math/cumsum.cc b/onnxruntime/core/providers/cuda/math/cumsum.cc index e0daf9e980..9541eec6da 100644 --- a/onnxruntime/core/providers/cuda/math/cumsum.cc +++ b/onnxruntime/core/providers/cuda/math/cumsum.cc @@ -77,7 +77,7 @@ Status CumSum::ComputeInternal(OpKernelContext* ctx) const { fast_divmod fast_divmod_input_stride_along_axis(static_cast(input_stride_along_axis)); if (input->IsDataType()) { - CumSumImpl(reinterpret_cast::MappedType*>(input->Data()), + CumSumImpl(Stream(), reinterpret_cast::MappedType*>(input->Data()), fast_divmod_input_dim_along_axis, fast_divmod_input_stride_along_axis, reinterpret_cast::MappedType*>(output.MutableData()), @@ -85,7 +85,7 @@ Status CumSum::ComputeInternal(OpKernelContext* ctx) const { exclusive_, reverse_); } else if (input->IsDataType()) { - CumSumImpl(reinterpret_cast::MappedType*>(input->Data()), + CumSumImpl(Stream(), reinterpret_cast::MappedType*>(input->Data()), fast_divmod_input_dim_along_axis, fast_divmod_input_stride_along_axis, reinterpret_cast::MappedType*>(output.MutableData()), @@ -93,7 +93,7 @@ Status CumSum::ComputeInternal(OpKernelContext* ctx) const { exclusive_, reverse_); } else if (input->IsDataType()) { - CumSumImpl(reinterpret_cast::MappedType*>(input->Data()), + CumSumImpl(Stream(), reinterpret_cast::MappedType*>(input->Data()), fast_divmod_input_dim_along_axis, fast_divmod_input_stride_along_axis, reinterpret_cast::MappedType*>(output.MutableData()), @@ -101,7 +101,7 @@ Status CumSum::ComputeInternal(OpKernelContext* ctx) const { exclusive_, reverse_); } else if (input->IsDataType()) { - CumSumImpl(reinterpret_cast::MappedType*>(input->Data()), + CumSumImpl(Stream(), reinterpret_cast::MappedType*>(input->Data()), fast_divmod_input_dim_along_axis, fast_divmod_input_stride_along_axis, reinterpret_cast::MappedType*>(output.MutableData()), @@ -109,7 +109,7 @@ Status CumSum::ComputeInternal(OpKernelContext* ctx) const { exclusive_, reverse_); } else if (input->IsDataType()) { - CumSumImpl(reinterpret_cast::MappedType*>(input->Data()), + CumSumImpl(Stream(), reinterpret_cast::MappedType*>(input->Data()), fast_divmod_input_dim_along_axis, fast_divmod_input_stride_along_axis, reinterpret_cast::MappedType*>(output.MutableData()), @@ -117,7 +117,7 @@ Status CumSum::ComputeInternal(OpKernelContext* ctx) const { exclusive_, reverse_); } else if (input->IsDataType()) { - CumSumImpl(reinterpret_cast::MappedType*>(input->Data()), + CumSumImpl(Stream(), reinterpret_cast::MappedType*>(input->Data()), fast_divmod_input_dim_along_axis, fast_divmod_input_stride_along_axis, reinterpret_cast::MappedType*>(output.MutableData()), @@ -125,7 +125,7 @@ Status CumSum::ComputeInternal(OpKernelContext* ctx) 
const { exclusive_, reverse_); } else if (input->IsDataType()) { - CumSumImpl(reinterpret_cast::MappedType*>(input->Data()), + CumSumImpl(Stream(), reinterpret_cast::MappedType*>(input->Data()), fast_divmod_input_dim_along_axis, fast_divmod_input_stride_along_axis, reinterpret_cast::MappedType*>(output.MutableData()), diff --git a/onnxruntime/core/providers/cuda/math/cumsum_impl.cu b/onnxruntime/core/providers/cuda/math/cumsum_impl.cu index 901bf8d2be..8a657dd9dc 100644 --- a/onnxruntime/core/providers/cuda/math/cumsum_impl.cu +++ b/onnxruntime/core/providers/cuda/math/cumsum_impl.cu @@ -71,6 +71,7 @@ __global__ void _CumSumKernel( template void CumSumImpl( + cudaStream_t stream, const T* input_data, const fast_divmod& input_dim_along_axis, const fast_divmod& input_stride_along_axis, @@ -81,7 +82,7 @@ void CumSumImpl( if (output_size > 0) { int blocksPerGrid = static_cast((output_size + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock); - _CumSumKernel<<>>(input_data, + _CumSumKernel<<>>(input_data, input_dim_along_axis, input_stride_along_axis, output_data, @@ -92,6 +93,7 @@ void CumSumImpl( } template void CumSumImpl( + cudaStream_t stream, const int32_t* input_data, const fast_divmod& input_dim_along_axis, const fast_divmod& input_stride_along_axis, @@ -101,6 +103,7 @@ template void CumSumImpl( bool reverse); template void CumSumImpl( + cudaStream_t stream, const int64_t* input_data, const fast_divmod& input_dim_along_axis, const fast_divmod& input_stride_along_axis, @@ -110,6 +113,7 @@ template void CumSumImpl( bool reverse); template void CumSumImpl( + cudaStream_t stream, const uint32_t* input_data, const fast_divmod& input_dim_along_axis, const fast_divmod& input_stride_along_axis, @@ -119,6 +123,7 @@ template void CumSumImpl( bool reverse); template void CumSumImpl( + cudaStream_t stream, const uint64_t* input_data, const fast_divmod& input_dim_along_axis, const fast_divmod& input_stride_along_axis, @@ -128,6 +133,7 @@ template void CumSumImpl( bool reverse); template void CumSumImpl( + cudaStream_t stream, const float* input_data, const fast_divmod& input_dim_along_axis, const fast_divmod& input_stride_along_axis, @@ -137,6 +143,7 @@ template void CumSumImpl( bool reverse); template void CumSumImpl( + cudaStream_t stream, const double* input_data, const fast_divmod& input_dim_along_axis, const fast_divmod& input_stride_along_axis, @@ -146,6 +153,7 @@ template void CumSumImpl( bool reverse); template void CumSumImpl( + cudaStream_t stream, const half* input_data, const fast_divmod& input_dim_along_axis, const fast_divmod& input_stride_along_axis, diff --git a/onnxruntime/core/providers/cuda/math/cumsum_impl.h b/onnxruntime/core/providers/cuda/math/cumsum_impl.h index f64a863ec9..ad77f748b0 100644 --- a/onnxruntime/core/providers/cuda/math/cumsum_impl.h +++ b/onnxruntime/core/providers/cuda/math/cumsum_impl.h @@ -11,6 +11,7 @@ namespace cuda { template void CumSumImpl( + cudaStream_t stream, const T* input_data, const fast_divmod& input_dim_along_axis, const fast_divmod& input_stride_along_axis, diff --git a/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops.cc b/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops.cc index b1da3135f9..4d3fd9d83b 100644 --- a/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops.cc +++ b/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops.cc @@ -12,14 +12,15 @@ namespace DeviceHelpers { namespace CudaDeviceHelpers { // CUDA EP specific Data copy helper -Status 
DataCopy(const Tensor& input, Tensor& output) { +Status DataCopy(const Tensor& input, Tensor& output, void* einsum_cuda_assets) { ORT_ENFORCE(output.SizeInBytes() == input.SizeInBytes(), "Einsum op: The candidate output does not match the actual output's shape"); // There are no string tensors in Einsum's case - so safely use memcpy // TODO: Currently, triggers copy on stream 0, investigate if we can still do that // *if* the kernel is launched in a different stream CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output.MutableDataRaw(), input.DataRaw(), input.SizeInBytes(), - cudaMemcpyDeviceToDevice)); + cudaMemcpyDeviceToDevice, + static_cast(static_cast(einsum_cuda_assets)->cuda_ep_->GetComputeStream()))); return Status::OK(); } @@ -28,6 +29,7 @@ Status DataCopy(const Tensor& input, Tensor& output) { Status Transpose(const std::vector& permutation, const Tensor& input, Tensor& output, const TensorShape* input_shape_override, void* einsum_cuda_assets) { return cuda::Transpose::DoTranspose(static_cast(einsum_cuda_assets)->cuda_ep_->GetDeviceProp(), + static_cast(static_cast(einsum_cuda_assets)->cuda_ep_->GetComputeStream()), static_cast(einsum_cuda_assets)->cublas_handle_, permutation, input, output, input_shape_override); } @@ -79,7 +81,7 @@ Tensor ReduceSum(const Tensor& input, const std::vector& reduce_axes, } // CUDA EP specific Diagonal helper -std::unique_ptr Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator) { +std::unique_ptr Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator, void* einsum_cuda_assets) { const auto& input_shape = input.Shape(); const auto& input_dims = input_shape.GetDims(); auto rank = static_cast(input_dims.size()); @@ -117,6 +119,7 @@ std::unique_ptr Diagonal(const Tensor& input, int64_t dim_1, int64_t dim } DiagonalImpl( + static_cast(static_cast(einsum_cuda_assets)->cuda_ep_->GetComputeStream()), input.DataRaw(), input.Shape().GetDims().size(), first_dim, diff --git a/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops.h b/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops.h index 90aa863a87..797f8f301e 100644 --- a/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops.h +++ b/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops.h @@ -38,7 +38,7 @@ namespace CudaDeviceHelpers { Status Transpose(const std::vector& permutation, const Tensor& input, Tensor& output, const TensorShape* input_shape_override, void* einsum_cuda_assets); -Status DataCopy(const Tensor& input, Tensor& output); +Status DataCopy(const Tensor& input, Tensor& output, void* einsum_cuda_assets); template Status MatMul(const T* input_1_data, const T* input_2_data, T* output_data, @@ -52,7 +52,7 @@ Tensor ReduceSum(const Tensor& input, const std::vector& reduce_axes, const TensorShape* input_shape_override, concurrency::ThreadPool* /*tp*/, void* einsum_cuda_assets); -std::unique_ptr Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator); +std::unique_ptr Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator, void* einsum_cuda_assets); } // namespace CudaDeviceHelpers diff --git a/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops_diagonal.cu b/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops_diagonal.cu index 6b73ae3117..d84396cc21 100644 --- a/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops_diagonal.cu +++ 
b/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops_diagonal.cu @@ -47,6 +47,7 @@ __global__ void _DiagonalKernel( } void DiagonalImpl( + cudaStream_t stream, const void* input_data, const int64_t input_rank, const int64_t dim_1, @@ -61,14 +62,14 @@ void DiagonalImpl( switch (element_size) { case sizeof(int32_t): - _DiagonalKernel<<>>( + _DiagonalKernel<<>>( reinterpret_cast::MappedType*>(input_data), input_rank, dim_1, dim_2, input_strides, reinterpret_cast::MappedType*>(output_data), output_strides, output_size); break; case sizeof(int64_t): - _DiagonalKernel<<>>( + _DiagonalKernel<<>>( reinterpret_cast::MappedType*>(input_data), input_rank, dim_1, dim_2, input_strides, reinterpret_cast::MappedType*>(output_data), output_strides, output_size); diff --git a/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops_diagonal.h b/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops_diagonal.h index 483978e663..f0d8416809 100644 --- a/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops_diagonal.h +++ b/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops_diagonal.h @@ -10,6 +10,7 @@ namespace onnxruntime { namespace cuda { void DiagonalImpl( + cudaStream_t stream, const void* input_data, const int64_t input_rank, const int64_t dim_1, diff --git a/onnxruntime/core/providers/cuda/math/gemm.cc b/onnxruntime/core/providers/cuda/math/gemm.cc index 03819891e9..79eeb5eaf2 100644 --- a/onnxruntime/core/providers/cuda/math/gemm.cc +++ b/onnxruntime/core/providers/cuda/math/gemm.cc @@ -86,6 +86,7 @@ Status Gemm::ComputeInternal(OpKernelContext* ctx) const { if (b_shape.Size() == 1) { // if B is (), (1,) or (1, 1), broadcast the scalar CUBLAS_RETURN_IF_ERROR(cublasCopyHelper( + Stream(), CublasHandle(), M * N, b_data, @@ -118,7 +119,7 @@ Status Gemm::ComputeInternal(OpKernelContext* ctx) const { out_data, N, device_prop)); } else { // B is (M, N), no broadcast needed. 
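Editor's note (illustrative sketch, not part of the patch): device-to-device copies in this change follow the same rule as the kernel launches: they are enqueued on the operator's compute stream instead of the legacy default stream, so they stay ordered with work already submitted to that stream (the Gemm bias copy just below is one instance). A minimal standalone example with placeholder names, not ONNX Runtime symbols:

    #include <cuda_runtime.h>

    // Enqueue a device-to-device copy on the caller's compute stream. It runs after any
    // work already queued on `stream` and before anything queued on it later, so no
    // cudaDeviceSynchronize is needed as long as producers and consumers share the stream.
    cudaError_t CopyOnComputeStream(cudaStream_t stream, float* dst, const float* src, size_t count) {
      return cudaMemcpyAsync(dst, src, count * sizeof(float), cudaMemcpyDeviceToDevice, stream);
    }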
-      CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(out_data, b_data, M * N * sizeof(T), cudaMemcpyDeviceToDevice));
+      CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(out_data, b_data, M * N * sizeof(T), cudaMemcpyDeviceToDevice, Stream()));
     }
   }
diff --git a/onnxruntime/core/providers/cuda/math/matmul_integer.cc b/onnxruntime/core/providers/cuda/math/matmul_integer.cc
index 5c7cb81320..89e738fca0 100644
--- a/onnxruntime/core/providers/cuda/math/matmul_integer.cc
+++ b/onnxruntime/core/providers/cuda/math/matmul_integer.cc
@@ -70,19 +70,20 @@ Status MatMulInteger::ComputeInternal(OpKernelContext* ctx) cons
   IAllocatorUniquePtr a_row_buf;
   if (b_offset != 0) {
     a_row_buf = GetScratchBuffer(helper.OutputShape().Size() / helper.N());
-    ORT_RETURN_IF_ERROR(ReduceRowSumOnMatrixA(a_ptr, a_row_buf.get(), b_offset, helper));
+    ORT_RETURN_IF_ERROR(ReduceRowSumOnMatrixA(Stream(), a_ptr, a_row_buf.get(), b_offset, helper));
   }
   IAllocatorUniquePtr b_col_buf;
   if (a_offset != 0) {
     b_col_buf = GetScratchBuffer(helper.OutputShape().Size() / helper.M());
-    ORT_RETURN_IF_ERROR(ReduceColSumOnMatrixB(b_ptr, b_col_buf.get(), a_offset, helper));
+    ORT_RETURN_IF_ERROR(ReduceColSumOnMatrixB(Stream(), b_ptr, b_col_buf.get(), a_offset, helper));
   }
   int alpha = 1;
   int beta = 0;
   if (a_offset != 0 || b_offset != 0) {
-    OffsetOutput(a_row_buf.get(),
+    OffsetOutput(Stream(),
+                 a_row_buf.get(),
                  b_col_buf.get(),
                  output_ptr,
                  a_offset,
diff --git a/onnxruntime/core/providers/cuda/math/matmul_integer.cu b/onnxruntime/core/providers/cuda/math/matmul_integer.cu
index 267cf198c9..f6a9d6488b 100644
--- a/onnxruntime/core/providers/cuda/math/matmul_integer.cu
+++ b/onnxruntime/core/providers/cuda/math/matmul_integer.cu
@@ -26,9 +26,9 @@ __global__ void ReduceRowSumOnMatrixAKernel(const int8_t* matrix, int32_t* row_s
   }
 }
-Status ReduceRowSumOnMatrixA(const int8_t* matrix, int32_t* row_sum, const int8_t offset, const MatMulComputeHelper& helper) {
+Status ReduceRowSumOnMatrixA(cudaStream_t stream, const int8_t* matrix, int32_t* row_sum, const int8_t offset, const MatMulComputeHelper& helper) {
   for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
-    ReduceRowSumOnMatrixAKernel(GridDim::maxThreadsPerBlock)><<(helper.M()), GridDim::maxThreadsPerBlock, 0>>>(matrix + helper.LeftOffsets()[batch],
+    ReduceRowSumOnMatrixAKernel(GridDim::maxThreadsPerBlock)><<(helper.M()), GridDim::maxThreadsPerBlock, 0, stream>>>(matrix + helper.LeftOffsets()[batch],
                                 row_sum + batch * helper.M(),
                                 offset,
                                 static_cast(helper.K()));
@@ -54,9 +54,9 @@ __global__ void ReduceColSumOnMatrixBKernel(const int8_t* matrix, int32_t* col_s
   }
 }
-Status ReduceColSumOnMatrixB(const int8_t* matrix, int32_t* col_sum, const int8_t offset, const MatMulComputeHelper& helper) {
+Status ReduceColSumOnMatrixB(cudaStream_t stream, const int8_t* matrix, int32_t* col_sum, const int8_t offset, const MatMulComputeHelper& helper) {
   for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
-    ReduceColSumOnMatrixBKernel(GridDim::maxThreadsPerBlock)><<(helper.N()), GridDim::maxThreadsPerBlock, 0>>>(matrix + helper.RightOffsets()[batch],
+    ReduceColSumOnMatrixBKernel(GridDim::maxThreadsPerBlock)><<(helper.N()), GridDim::maxThreadsPerBlock, 0, stream>>>(matrix + helper.RightOffsets()[batch],
                                 col_sum + batch * helper.N(),
                                 offset,
                                 static_cast(helper.K()),
@@ -92,7 +92,8 @@ __global__ void ComputeOffsetOfMatrixB(const int32_t* row_sum,
   }
 }
-Status OffsetOutput(const int32_t* row_sum,
+Status OffsetOutput(cudaStream_t stream,
+                    const int32_t* row_sum,
                     const int32_t* col_sum,
                     int32_t*
output, const int8_t a_offset, @@ -100,7 +101,7 @@ Status OffsetOutput(const int32_t* row_sum, const MatMulComputeHelper& helper) { if (a_offset && b_offset) { for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) { - ComputeOffsetOfMatrixAB<<(helper.M()), GridDim::maxThreadsPerBlock, 0>>>( + ComputeOffsetOfMatrixAB<<(helper.M()), GridDim::maxThreadsPerBlock, 0, stream>>>( row_sum + batch * helper.M(), col_sum + batch * helper.N(), output + helper.OutputOffsets()[batch], @@ -109,14 +110,14 @@ Status OffsetOutput(const int32_t* row_sum, } } else if (a_offset) { for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) { - ComputeOffsetOfMatrixA<<(helper.M()), GridDim::maxThreadsPerBlock, 0>>>( + ComputeOffsetOfMatrixA<<(helper.M()), GridDim::maxThreadsPerBlock, 0, stream>>>( col_sum + batch * helper.N(), output + helper.OutputOffsets()[batch], static_cast(helper.N())); } } else if (b_offset) { for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) { - ComputeOffsetOfMatrixB<<(helper.M()), GridDim::maxThreadsPerBlock, 0>>>( + ComputeOffsetOfMatrixB<<(helper.M()), GridDim::maxThreadsPerBlock, 0, stream>>>( row_sum + batch * helper.M(), output + helper.OutputOffsets()[batch], static_cast(helper.N())); diff --git a/onnxruntime/core/providers/cuda/math/matmul_integer.cuh b/onnxruntime/core/providers/cuda/math/matmul_integer.cuh index e22bbf4d24..e6dc24fc08 100644 --- a/onnxruntime/core/providers/cuda/math/matmul_integer.cuh +++ b/onnxruntime/core/providers/cuda/math/matmul_integer.cuh @@ -11,9 +11,10 @@ namespace onnxruntime { namespace cuda { -Status ReduceRowSumOnMatrixA(const int8_t* matrix, int32_t* row_sum, const int8_t offset, const MatMulComputeHelper& helper); -Status ReduceColSumOnMatrixB(const int8_t* matrix, int32_t* col_sum, const int8_t offset, const MatMulComputeHelper& helper); -Status OffsetOutput(const int32_t* row_sum, +Status ReduceRowSumOnMatrixA(cudaStream_t stream, const int8_t* matrix, int32_t* row_sum, const int8_t offset, const MatMulComputeHelper& helper); +Status ReduceColSumOnMatrixB(cudaStream_t stream, const int8_t* matrix, int32_t* col_sum, const int8_t offset, const MatMulComputeHelper& helper); +Status OffsetOutput(cudaStream_t stream, + const int32_t* row_sum, const int32_t* col_sum, int32_t* output, const int8_t a_offset, diff --git a/onnxruntime/core/providers/cuda/math/softmax.cc b/onnxruntime/core/providers/cuda/math/softmax.cc index 09753b66a2..7efeb6425e 100644 --- a/onnxruntime/core/providers/cuda/math/softmax.cc +++ b/onnxruntime/core/providers/cuda/math/softmax.cc @@ -13,6 +13,7 @@ namespace cuda { template Status SoftMaxComputeHelper( + cudaStream_t stream, const T* X, const TensorShape& input_shape, T* Y, @@ -28,7 +29,7 @@ Status SoftMaxComputeHelper( // cudnnSoftmaxForward/Backward is not optimal implementation. // TODO: remove cudnn path completely in the future. 
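Editor's note (illustrative sketch, not part of the patch): the launch-side pattern repeated throughout this diff is that a device helper takes cudaStream_t as its first parameter and passes it as the fourth launch-configuration argument, as dispatch_softmax_forward does just below. A self-contained CUDA example of that shape, with placeholder names:

    __global__ void ScaleKernel(const float* in, float* out, float alpha, size_t n) {
      size_t i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) out[i] = alpha * in[i];
    }

    // The stream comes first and is forwarded to the launch: <<<grid, block, shared mem, stream>>>.
    void ScaleImpl(cudaStream_t stream, const float* in, float* out, float alpha, size_t n) {
      if (n == 0) return;
      constexpr int kThreadsPerBlock = 256;
      int blocks = static_cast<int>((n + kThreadsPerBlock - 1) / kThreadsPerBlock);
      ScaleKernel<<<blocks, kThreadsPerBlock, 0, stream>>>(in, out, alpha, n);
    }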
if (D <= 1024 && D * sizeof(T) <= 4096) { - dispatch_softmax_forward, is_log_softmax>(Y_data, X_data, gsl::narrow_cast(D), gsl::narrow_cast(D), gsl::narrow_cast(N)); + dispatch_softmax_forward, is_log_softmax>(stream, Y_data, X_data, gsl::narrow_cast(D), gsl::narrow_cast(D), gsl::narrow_cast(N)); return Status::OK(); } @@ -50,8 +51,8 @@ Status SoftMaxComputeHelper( } #define SPECIALIZED_SOFTMAX_HELPER_IMPL(T) \ - template Status SoftMaxComputeHelper(const T* input, const TensorShape& shape, T* Y, cudnnHandle_t handle, int64_t axis); \ - template Status SoftMaxComputeHelper(const T* input, const TensorShape& shape, T* Y, cudnnHandle_t handle, int64_t axis); + template Status SoftMaxComputeHelper(cudaStream_t stream, const T* input, const TensorShape& shape, T* Y, cudnnHandle_t handle, int64_t axis); \ + template Status SoftMaxComputeHelper(cudaStream_t stream, const T* input, const TensorShape& shape, T* Y, cudnnHandle_t handle, int64_t axis); SPECIALIZED_SOFTMAX_HELPER_IMPL(float) SPECIALIZED_SOFTMAX_HELPER_IMPL(double) @@ -62,6 +63,7 @@ SPECIALIZED_SOFTMAX_HELPER_IMPL(MLFloat16) #define SPECIALIZED_SOFTMAX_HELPER_IMPL_BFloat16(is_log_softmax) \ template <> \ Status SoftMaxComputeHelper( \ + cudaStream_t stream, \ const BFloat16* X, \ const TensorShape& input_shape, \ BFloat16* Y, \ @@ -73,7 +75,7 @@ SPECIALIZED_SOFTMAX_HELPER_IMPL(MLFloat16) auto Y_data = reinterpret_cast(Y); \ auto X_data = reinterpret_cast(X); \ dispatch_softmax_forward, is_log_softmax>( \ - Y_data, X_data, gsl::narrow_cast(D), gsl::narrow_cast(D), gsl::narrow_cast(N)); \ + stream, Y_data, X_data, gsl::narrow_cast(D), gsl::narrow_cast(D), gsl::narrow_cast(N)); \ return Status::OK(); \ } @@ -183,6 +185,7 @@ Status Softmax::ComputeInternal(OpKernelContext* ctx) const { // Perform the transpose ORT_RETURN_IF_ERROR(Transpose::DoTranspose(cuda_ep_->GetDeviceProp(), + Stream(), CublasHandle(), permutation, *X, temp_input)); transposed_input = std::move(temp_input); @@ -208,11 +211,11 @@ Status Softmax::ComputeInternal(OpKernelContext* ctx) const { Status status; if (log_softmax_) { - status = SoftMaxComputeHelper(X_data, *compute_input_shape, Y_data, CudnnHandle(), + status = SoftMaxComputeHelper(Stream(), X_data, *compute_input_shape, Y_data, CudnnHandle(), is_transpose_required ? static_cast(rank) - 1 : static_cast(axis)); } else { - status = SoftMaxComputeHelper(X_data, *compute_input_shape, Y_data, CudnnHandle(), + status = SoftMaxComputeHelper(Stream(), X_data, *compute_input_shape, Y_data, CudnnHandle(), is_transpose_required ? 
static_cast(rank) - 1 : static_cast(axis)); } @@ -227,6 +230,7 @@ Status Softmax::ComputeInternal(OpKernelContext* ctx) const { } // Perform the transpose to get the axes back to the original ordering ORT_RETURN_IF_ERROR(Transpose::DoTranspose(cuda_ep_->GetDeviceProp(), + Stream(), CublasHandle(), reverse_permutation, intermediate_output, *Y)); } diff --git a/onnxruntime/core/providers/cuda/math/softmax.h b/onnxruntime/core/providers/cuda/math/softmax.h index 772920de88..3af26690dc 100644 --- a/onnxruntime/core/providers/cuda/math/softmax.h +++ b/onnxruntime/core/providers/cuda/math/softmax.h @@ -11,6 +11,7 @@ namespace cuda { template Status SoftMaxComputeHelper( + cudaStream_t stream, const T* input, const TensorShape& shape, T* Y, @@ -18,7 +19,7 @@ Status SoftMaxComputeHelper( int64_t axis); template -void dispatch_softmax_forward(output_t* dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count); +void dispatch_softmax_forward(cudaStream_t stream, output_t* dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count); template class Softmax final : public CudaKernel { diff --git a/onnxruntime/core/providers/cuda/math/softmax_impl.cu b/onnxruntime/core/providers/cuda/math/softmax_impl.cu index f4658e93fe..80a680963f 100644 --- a/onnxruntime/core/providers/cuda/math/softmax_impl.cu +++ b/onnxruntime/core/providers/cuda/math/softmax_impl.cu @@ -135,7 +135,7 @@ __global__ void softmax_warp_forward(output_t* dst, const input_t* src, int batc } template -void dispatch_softmax_forward(output_t* dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count) { +void dispatch_softmax_forward(cudaStream_t stream, output_t* dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count) { if (softmax_elements == 0) { return; } else { @@ -159,47 +159,47 @@ void dispatch_softmax_forward(output_t* dst, const input_t* src, int softmax_ele switch (log2_elements) { case 0: // 1 softmax_warp_forward - <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); + <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 1: // 2 softmax_warp_forward - <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); + <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 2: // 4 softmax_warp_forward - <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); + <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 3: // 8 softmax_warp_forward - <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); + <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 4: // 16 softmax_warp_forward - <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); + <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 5: // 32 softmax_warp_forward - <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); + <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 6: // 64 softmax_warp_forward - <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); + <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 7: // 128 softmax_warp_forward - <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); + <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 8: // 256 
softmax_warp_forward - <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); + <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 9: // 512 softmax_warp_forward - <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); + <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 10: // 1024 softmax_warp_forward - <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); + <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); break; default: break; @@ -208,8 +208,8 @@ void dispatch_softmax_forward(output_t* dst, const input_t* src, int softmax_ele } #define SPECIALIZED_SOFTMAX_IMPL(input_t, output_t, acc_t) \ -template void dispatch_softmax_forward(output_t * dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count); \ -template void dispatch_softmax_forward(output_t * dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count); +template void dispatch_softmax_forward(cudaStream_t stream, output_t * dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count); \ +template void dispatch_softmax_forward(cudaStream_t stream, output_t * dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count); SPECIALIZED_SOFTMAX_IMPL(float, float, float) SPECIALIZED_SOFTMAX_IMPL(half, half, float) diff --git a/onnxruntime/core/providers/cuda/math/topk_impl.cu b/onnxruntime/core/providers/cuda/math/topk_impl.cu index 2ba3ac4086..db609b9aa9 100644 --- a/onnxruntime/core/providers/cuda/math/topk_impl.cu +++ b/onnxruntime/core/providers/cuda/math/topk_impl.cu @@ -419,23 +419,24 @@ __global__ void ExcludeOutput(int64_t* output_i, int64_t K, int64_t dimension) { template Status TopKImpl(const CudaKernel* kernel, const T* input_x, T* output_v, int64_t* output_i, const TArray& elem_nums, size_t size, int32_t axis, int64_t K, int64_t largest, int64_t sorted, int64_t N, int64_t dimension) { typedef typename ToCudaType::MappedType CudaT; + cudaStream_t stream = kernel->Stream(); const CudaT* input_x_ptr = reinterpret_cast(input_x); CudaT* output_v_ptr = reinterpret_cast(output_v); auto aligned_K = ALIGN(K); auto aligned_dimension = ALIGN(dimension); if (aligned_dimension <= GridDim::maxThreadsPerBlock) { - BitonicTopK<<)>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, aligned_K, largest, sorted, dimension, aligned_dimension, NumericLimits::Lowest(), NumericLimits::Max()); + BitonicTopK<<), stream>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, aligned_K, largest, sorted, dimension, aligned_dimension, NumericLimits::Lowest(), NumericLimits::Max()); } else if (K <= BT*16 || 0 == sorted) { auto XPT = static_cast(ceil(static_cast(dimension) / GridDim::maxThreadsPerBlock)); if (BT*2 >= K || 0 == sorted) { - RadixTopK<<>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits::Lowest(), NumericLimits::Max()); + RadixTopK<<>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits::Lowest(), NumericLimits::Max()); } else if (BT*4>=K) { - RadixTopK<<>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits::Lowest(), NumericLimits::Max()); + RadixTopK<<>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits::Lowest(), 
NumericLimits::Max()); } else if (BT*8>=K) { - RadixTopK<<>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits::Lowest(), NumericLimits::Max()); + RadixTopK<<>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits::Lowest(), NumericLimits::Max()); } else { - RadixTopK<<>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits::Lowest(), NumericLimits::Max()); + RadixTopK<<>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits::Lowest(), NumericLimits::Max()); } } else { auto input_key_buffer = kernel->GetScratchBuffer(dimension); @@ -447,21 +448,21 @@ Status TopKImpl(const CudaKernel* kernel, const T* input_x, T* output_v, int64_t auto* input_value = input_value_buffer.get(); auto* output_value = output_value_buffer.get(); size_t temp_bytes = 0; - CUDA_RETURN_IF_ERROR(cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, input_key, output_key, input_value, output_value, dimension)); + CUDA_RETURN_IF_ERROR(cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, input_key, output_key, input_value, output_value, dimension, 0, sizeof(T)*8, stream)); auto temp_storage_buffer = kernel->GetScratchBuffer(temp_bytes); auto* temp_storage = temp_storage_buffer.get(); auto blocks_per_grid_D = (int)(ceil(static_cast(dimension) / BT)); auto blocks_per_grid_K = (int)(ceil(static_cast(K) / BT)); for (int64_t i = 0; i < N; i++) { - FillInput<<>>(input_x_ptr, input_key, input_value, elem_nums, size, axis, K, i, dimension); - CUDA_RETURN_IF_ERROR(1 == largest ? cub::DeviceRadixSort::SortPairsDescending(temp_storage, temp_bytes, input_key, output_key, input_value, output_value, dimension) - : cub::DeviceRadixSort::SortPairs(temp_storage, temp_bytes, input_key, output_key, input_value, output_value, dimension)); + FillInput<<>>(input_x_ptr, input_key, input_value, elem_nums, size, axis, K, i, dimension); + CUDA_RETURN_IF_ERROR(1 == largest ? 
cub::DeviceRadixSort::SortPairsDescending(temp_storage, temp_bytes, input_key, output_key, input_value, output_value, dimension, 0, sizeof(T)*8, stream) + : cub::DeviceRadixSort::SortPairs(temp_storage, temp_bytes, input_key, output_key, input_value, output_value, dimension, 0, sizeof(T)*8, stream)); if (1 == sorted) { - FillOutput<<>>(output_key, output_value, output_v_ptr, output_i, elem_nums, size, axis, K, i, dimension); + FillOutput<<>>(output_key, output_value, output_v_ptr, output_i, elem_nums, size, axis, K, i, dimension); } else { //reorder by ascending index - ExcludeOutput<<>>(output_value, K, dimension); - CUDA_RETURN_IF_ERROR(cub::DeviceRadixSort::SortPairs(temp_storage, temp_bytes, output_value, input_value, output_key, input_key, dimension)); - FillOutput<<>>(input_key, input_value, output_v_ptr, output_i, elem_nums, size, axis, K, i, dimension); + ExcludeOutput<<>>(output_value, K, dimension); + CUDA_RETURN_IF_ERROR(cub::DeviceRadixSort::SortPairs(temp_storage, temp_bytes, output_value, input_value, output_key, input_key, dimension, 0, sizeof(T)*8, stream)); + FillOutput<<>>(input_key, input_value, output_v_ptr, output_i, elem_nums, size, axis, K, i, dimension); } } } diff --git a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc index deae967c84..53220ae131 100644 --- a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc +++ b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc @@ -50,6 +50,7 @@ Status UnaryElementwise::Prepare(OpKernelContext* context, UnaryElementwisePrepa UnaryElementwisePreparation p; \ ORT_RETURN_IF_ERROR(UnaryElementwise::Prepare(context, &p)); \ Impl_##x( \ + Stream(), \ reinterpret_cast::MappedType*>(p.input_tensor->template Data()), \ reinterpret_cast::MappedType*>(p.output_tensor->template MutableData()), \ p.output_tensor->Shape().Size()); \ diff --git a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.cu b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.cu index fe60e66856..5b5102938d 100644 --- a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.cu +++ b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.cu @@ -19,14 +19,15 @@ namespace cuda { #define UNARY_ELEMENTWISE_IMPL(name) \ UNARY_ELEMENTWISE_IMPL_DECLARATION(name) { \ - UnaryElementWiseImpl(input_data, \ + UnaryElementWiseImpl(stream, \ + input_data, \ output_data, \ OP_##name(), \ count); \ } #define SPECIALIZED_UNARY_ELEMENTWISE_IMPL(name, T) \ - template void Impl_##name(const T* input_data, T* output_data, size_t count); + template void Impl_##name(cudaStream_t stream, const T* input_data, T* output_data, size_t count); #define UNARY_OP_NAME_EXPR(name, expr) \ OP(name, expr) \ @@ -116,17 +117,19 @@ struct OP_Cast { template void Impl_Cast( + cudaStream_t stream, const InT* input_data, OutT* output_data, size_t count) { - UnaryElementWiseImpl(input_data, + UnaryElementWiseImpl(stream, + input_data, output_data, OP_Cast(), count); } #define SPECIALIZED_CAST_IMPL2(InT, OutT) \ - template void Impl_Cast(const InT* input_data, OutT* output_data, size_t count); + template void Impl_Cast(cudaStream_t stream, const InT* input_data, OutT* output_data, size_t count); #if CUDA_VERSION >= 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) #define SPECIALIZED_CAST_IMPL2_BF16(T) SPECIALIZED_CAST_IMPL2(T, nv_bfloat16) diff --git a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.h 
b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.h index 81123c46bf..2b28886386 100644 --- a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.h +++ b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.h @@ -29,6 +29,7 @@ namespace cuda { #define UNARY_ELEMENTWISE_IMPL_DECLARATION(name) \ template \ void Impl_##name( \ + cudaStream_t stream, \ const T* input_data, \ T* output_data, \ size_t count) @@ -39,6 +40,7 @@ UNARY_OPS() template void Impl_Cast( + cudaStream_t stream, const InT* input_data, OutT* output_data, size_t count); diff --git a/onnxruntime/core/providers/cuda/math/variadic_elementwise_ops.cc b/onnxruntime/core/providers/cuda/math/variadic_elementwise_ops.cc index beb3e829f3..7db97d0ed1 100644 --- a/onnxruntime/core/providers/cuda/math/variadic_elementwise_ops.cc +++ b/onnxruntime/core/providers/cuda/math/variadic_elementwise_ops.cc @@ -17,7 +17,7 @@ namespace cuda { template template Status VariadicElementwiseOp:: - NoBroadcastBatchImplDispatchTarget::operator()(const InputTensorVector& inputs, Tensor& output) const { + NoBroadcastBatchImplDispatchTarget::operator()(cudaStream_t stream, const InputTensorVector& inputs, Tensor& output) const { assert(inputs.size() > 1); using CudaT = typename ToCudaType::MappedType; @@ -30,7 +30,7 @@ Status VariadicElementwiseOp CudaT* output_data = reinterpret_cast(output.template MutableData()); Impl_NoBroadcastInputBatch( - input_data_batch, output_data, output.Shape().Size()); + stream, input_data_batch, output_data, output.Shape().Size()); return Status::OK(); } @@ -39,13 +39,14 @@ Status VariadicElementwiseOp template template Status VariadicElementwiseOp:: - BinaryImplDispatchTarget::operator()(const Tensor& lhs, const Tensor& rhs, Tensor& output) const { + BinaryImplDispatchTarget::operator()(cudaStream_t stream, const Tensor& lhs, const Tensor& rhs, Tensor& output) const { using CudaT = typename ToCudaType::MappedType; BinaryElementwisePreparation prepare; ORT_RETURN_IF_ERROR(BinaryElementwiseBroadcastPrepare(&lhs, &rhs, &output, &prepare)); Impl_General( + stream, prepare.output_rank_or_simple_broadcast, &prepare.lhs_padded_strides, reinterpret_cast(prepare.lhs_tensor->template Data()), @@ -64,17 +65,18 @@ Status VariadicElementwiseOp template template Status VariadicElementwiseOp:: - GeneralImplDispatchTarget::operator()(const InputTensorVector& inputs, Tensor& output) const { + GeneralImplDispatchTarget::operator()(cudaStream_t stream, const InputTensorVector& inputs, Tensor& output) const { assert(inputs.size() > 1); using CudaT = typename ToCudaType::MappedType; - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(output.MutableDataRaw(), 0, output.SizeInBytes())); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(output.MutableDataRaw(), 0, output.SizeInBytes(), stream)); BinaryElementwisePreparation prepare; ORT_RETURN_IF_ERROR(BinaryElementwiseBroadcastPrepare(&output, &inputs[0].get(), &output, &prepare)); Impl_Add( + stream, prepare.output_rank_or_simple_broadcast, &prepare.lhs_padded_strides, reinterpret_cast(prepare.lhs_tensor->template Data()), @@ -90,6 +92,7 @@ Status VariadicElementwiseOp ORT_RETURN_IF_ERROR(BinaryElementwiseBroadcastPrepare(&output, &inputs[index].get(), &output, &prepare)); Impl_General( + stream, prepare.output_rank_or_simple_broadcast, &prepare.lhs_padded_strides, reinterpret_cast(prepare.lhs_tensor->template Data()), @@ -132,7 +135,7 @@ Status VariadicElementwiseOp if (first_input_tensor.DataRaw() != output_tensor.DataRaw()) { CUDA_RETURN_IF_ERROR(cudaMemcpyAsync( 
output_tensor.MutableDataRaw(), first_input_tensor.DataRaw(), first_input_tensor.SizeInBytes(), - cudaMemcpyDeviceToDevice)); + cudaMemcpyDeviceToDevice, Stream())); } return Status::OK(); @@ -152,14 +155,14 @@ Status VariadicElementwiseOp // special case for no broadcasting and 2 inputs if (input_count == 2) { utils::MLTypeCallDispatcherRet dispatcher(element_type); - ORT_RETURN_IF_ERROR(dispatcher.Invoke(input_tensors[0], input_tensors[1], output_tensor)); + ORT_RETURN_IF_ERROR(dispatcher.Invoke(Stream(), input_tensors[0], input_tensors[1], output_tensor)); return Status::OK(); } utils::MLTypeCallDispatcherRet dispatcher( element_type); - ORT_RETURN_IF_ERROR(dispatcher.Invoke(input_tensors, output_tensor)); + ORT_RETURN_IF_ERROR(dispatcher.Invoke(Stream(), input_tensors, output_tensor)); return Status::OK(); } @@ -177,7 +180,7 @@ Status VariadicElementwiseOp // special case for 2 inputs if (input_count == 2) { utils::MLTypeCallDispatcherRet dispatcher(element_type); - ORT_RETURN_IF_ERROR(dispatcher.Invoke(input_tensors[0], input_tensors[1], output_tensor)); + ORT_RETURN_IF_ERROR(dispatcher.Invoke(Stream(), input_tensors[0], input_tensors[1], output_tensor)); return Status::OK(); } @@ -186,7 +189,7 @@ Status VariadicElementwiseOp { utils::MLTypeCallDispatcherRet dispatcher( element_type); - ORT_RETURN_IF_ERROR(dispatcher.Invoke(input_tensors, output_tensor)); + ORT_RETURN_IF_ERROR(dispatcher.Invoke(Stream(), input_tensors, output_tensor)); } return Status::OK(); diff --git a/onnxruntime/core/providers/cuda/math/variadic_elementwise_ops.h b/onnxruntime/core/providers/cuda/math/variadic_elementwise_ops.h index 101e8389cd..42d83f81ad 100644 --- a/onnxruntime/core/providers/cuda/math/variadic_elementwise_ops.h +++ b/onnxruntime/core/providers/cuda/math/variadic_elementwise_ops.h @@ -24,17 +24,17 @@ class VariadicElementwiseOp : public CudaKernel { template struct NoBroadcastBatchImplDispatchTarget { - Status operator()(const InputTensorVector& inputs, Tensor& output) const; + Status operator()(cudaStream_t stream, const InputTensorVector& inputs, Tensor& output) const; }; template struct BinaryImplDispatchTarget { - Status operator()(const Tensor& lhs, const Tensor& rhs, Tensor& output) const; + Status operator()(cudaStream_t stream, const Tensor& lhs, const Tensor& rhs, Tensor& output) const; }; template struct GeneralImplDispatchTarget { - Status operator()(const InputTensorVector& inputs, Tensor& output) const; + Status operator()(cudaStream_t stream, const InputTensorVector& inputs, Tensor& output) const; }; }; diff --git a/onnxruntime/core/providers/cuda/math/variadic_elementwise_ops_impl.cu b/onnxruntime/core/providers/cuda/math/variadic_elementwise_ops_impl.cu index ad975f85c3..da1f228e5b 100644 --- a/onnxruntime/core/providers/cuda/math/variadic_elementwise_ops_impl.cu +++ b/onnxruntime/core/providers/cuda/math/variadic_elementwise_ops_impl.cu @@ -20,6 +20,7 @@ struct VariadicElementwiseOpTraits; using ScalarComputeFunctor = OP_##ImplName; \ \ static void ComputeFn( \ + cudaStream_t stream, \ int32_t output_rank_or_simple_broadcast, \ const TArray* lhs_padded_strides, \ const T* lhs_data, \ @@ -31,6 +32,7 @@ struct VariadicElementwiseOpTraits; T* output_data, \ size_t count) { \ Impl_##ImplName( \ + stream, \ output_rank_or_simple_broadcast, \ lhs_padded_strides, \ lhs_data, \ @@ -52,6 +54,7 @@ DEFINE_TRAITS(variadic_elementwise_ops::Max, Max) template void Impl_General( + cudaStream_t stream, int32_t output_rank_or_simple_broadcast, const TArray* lhs_padded_strides, const T* 
lhs_data, @@ -63,6 +66,7 @@ void Impl_General( T* output_data, size_t count) { VariadicElementwiseOpTraits::ComputeFn( + stream, output_rank_or_simple_broadcast, lhs_padded_strides, lhs_data, @@ -77,12 +81,14 @@ void Impl_General( template void Impl_NoBroadcastInputBatch( + cudaStream_t stream, InputBatchArray input_data_batch, T* output_data, size_t count) { VariadicElementWiseNoBroadcastInputBatchImpl< T, typename VariadicElementwiseOpTraits::ScalarComputeFunctor, k_max_input_batch_size>( + stream, typename VariadicElementwiseOpTraits::ScalarComputeFunctor{}, count, input_data_batch, @@ -91,6 +97,7 @@ void Impl_NoBroadcastInputBatch( #define SPECIALIZE_IMPL(T, VariadicElementwiseOpTag) \ template void Impl_General( \ + cudaStream_t stream, \ int32_t output_rank_or_simple_broadcast, \ const TArray* lhs_padded_strides, \ const T* lhs_data, \ @@ -103,6 +110,7 @@ void Impl_NoBroadcastInputBatch( size_t count); \ \ template void Impl_NoBroadcastInputBatch( \ + cudaStream_t stream, \ InputBatchArray input_data_batch, \ T * output_data, \ size_t count); diff --git a/onnxruntime/core/providers/cuda/math/variadic_elementwise_ops_impl.h b/onnxruntime/core/providers/cuda/math/variadic_elementwise_ops_impl.h index 39806f0ccc..72316332b1 100644 --- a/onnxruntime/core/providers/cuda/math/variadic_elementwise_ops_impl.h +++ b/onnxruntime/core/providers/cuda/math/variadic_elementwise_ops_impl.h @@ -12,6 +12,7 @@ namespace cuda { template void Impl_General( + cudaStream_t stream, int32_t output_rank_or_simple_broadcast, const TArray* lhs_padded_strides, const T* lhs_data, @@ -30,6 +31,7 @@ using InputBatchArray = TArray; template void Impl_NoBroadcastInputBatch( + cudaStream_t stream, InputBatchArray input_data_batch, T* output_data, size_t count); diff --git a/onnxruntime/core/providers/cuda/multi_tensor/common.cuh b/onnxruntime/core/providers/cuda/multi_tensor/common.cuh index 6c779ad501..84cd10ad24 100644 --- a/onnxruntime/core/providers/cuda/multi_tensor/common.cuh +++ b/onnxruntime/core/providers/cuda/multi_tensor/common.cuh @@ -73,6 +73,7 @@ int compute_max_tensor_size_per_launch(int element_count_per_thread) { template void launch_multi_tensor_functor( + cudaStream_t stream, const int chunk_size, std::vector& tensor_sizes, std::vector>& grouped_tensor_pointers, @@ -121,7 +122,7 @@ void launch_multi_tensor_functor( chunk_group.chunk_count = block_index; if (block_index == chunk_group.max_block_count) { - multipleTensorKernel(chunk_group, std::forward(kernelParams)...); + multipleTensorKernel(stream, chunk_group, std::forward(kernelParams)...); block_index = 0; } } @@ -129,7 +130,7 @@ void launch_multi_tensor_functor( // After ++tensor_group_index, tensor_group_index becomes the count of tensor group in chunk_group. ++tensor_group_index; if (tensor_group_index == chunk_group.max_tensor_group_count) { - multipleTensorKernel(chunk_group, std::forward(kernelParams)...); + multipleTensorKernel(stream, chunk_group, std::forward(kernelParams)...); block_index = 0; tensor_group_index = 0; } @@ -138,7 +139,7 @@ void launch_multi_tensor_functor( // This round of processing tensor group is finished. // All the groups remain in chunk group should be processed right now. 
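Editor's note (illustrative sketch, not part of the patch): utilities such as launch_multi_tensor_functor above do not launch kernels themselves; they now accept the stream and forward it, together with the remaining arguments, to the functor that performs the actual launch. A reduced sketch of that forwarding shape, with placeholder names:

    #include <cuda_runtime.h>
    #include <utility>

    template <typename TFunctor, typename... TArgs>
    void LaunchOnStream(cudaStream_t stream, TFunctor&& functor, TArgs&&... args) {
      // The functor is expected to use `stream` in its own <<<grid, block, shared, stream>>> launch.
      std::forward<TFunctor>(functor)(stream, std::forward<TArgs>(args)...);
    }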
if (block_index != 0) { - multipleTensorKernel(chunk_group, std::forward(kernelParams)...); + multipleTensorKernel(stream, chunk_group, std::forward(kernelParams)...); block_index = 0; tensor_group_index = 0; } diff --git a/onnxruntime/core/providers/cuda/nn/batch_norm.cc b/onnxruntime/core/providers/cuda/nn/batch_norm.cc index a1af24a4ef..db312fdd2c 100644 --- a/onnxruntime/core/providers/cuda/nn/batch_norm.cc +++ b/onnxruntime/core/providers/cuda/nn/batch_norm.cc @@ -81,10 +81,10 @@ Status BatchNorm::ComputeInternal(OpKernelContext* p_op_kernel_context) const auto f_B = GetScratchBuffer(C); auto f_mean = GetScratchBuffer(C); auto f_var = GetScratchBuffer(C); - Impl_Cast(scale_data, f_scale.get(), C); - Impl_Cast(b_data, f_B.get(), C); - Impl_Cast(mean_data, f_mean.get(), C); - Impl_Cast(var_data, f_var.get(), C); + Impl_Cast(Stream(), scale_data, f_scale.get(), C); + Impl_Cast(Stream(), b_data, f_B.get(), C); + Impl_Cast(Stream(), mean_data, f_mean.get(), C); + Impl_Cast(Stream(), var_data, f_var.get(), C); CUDNN_RETURN_IF_ERROR(cudnnBatchNormalizationForwardInference( CudnnHandle(), diff --git a/onnxruntime/core/providers/cuda/nn/conv.cc b/onnxruntime/core/providers/cuda/nn/conv.cc index 20ada62308..f61f93fab5 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.cc +++ b/onnxruntime/core/providers/cuda/nn/conv.cc @@ -34,7 +34,8 @@ REGISTER_KERNEL_TYPED(float) REGISTER_KERNEL_TYPED(double) REGISTER_KERNEL_TYPED(MLFloat16) -Status SliceOutUnwantedOutputSection(const void* input_data, +Status SliceOutUnwantedOutputSection(cudaStream_t stream, + const void* input_data, const std::vector& input_dims, void* output_data, const std::vector& output_dims, @@ -49,7 +50,7 @@ Status SliceOutUnwantedOutputSection(const void* input_data, // As a sanity check, ensure that the slice operator's output shape matches with the expected output shape ORT_ENFORCE(compute_metadata.output_dims_ == output_dims); - return SliceCuda::Impl(input_data, input_dims, output_data, compute_metadata, element_size); + return SliceCuda::Impl(stream, input_data, input_dims, output_data, compute_metadata, element_size); } template @@ -195,7 +196,7 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) const s_.b_zero = nullptr; } CUDA_CALL_THROW(cudaMalloc(&s_.b_zero, malloc_size)); - CUDA_CALL_THROW(cudaMemset(s_.b_zero, 0, malloc_size)); + CUDA_CALL_THROW(cudaMemsetAsync(s_.b_zero, 0, malloc_size, Stream())); } if (!s_.cached_benchmark_results.contains(x_dims_cudnn)) { @@ -306,7 +307,7 @@ Status Conv::ComputeInternal(OpKernelContext* context) const { // To deal with asymmetric padding, we may have over-padded on one or both sides of the spatial dimensions // This may have lead to extra results that are unnecessary and hence we slice that off here if (s_.post_slicing_required) { - SliceOutUnwantedOutputSection(s_.y_data, s_.y_dims_with_adjusted_pads, s_.Y->MutableDataRaw(), + SliceOutUnwantedOutputSection(Stream(), s_.y_data, s_.y_dims_with_adjusted_pads, s_.Y->MutableDataRaw(), s_.y_dims, s_.slice_starts, s_.slice_ends, s_.slice_axes, s_.element_size); } return Status::OK(); diff --git a/onnxruntime/core/providers/cuda/nn/conv.h b/onnxruntime/core/providers/cuda/nn/conv.h index e39bda1c59..04f9865a1a 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.h +++ b/onnxruntime/core/providers/cuda/nn/conv.h @@ -189,7 +189,8 @@ class Conv : public CudaKernel { constexpr static auto kDefaultConvAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; }; -Status SliceOutUnwantedOutputSection(const void* input_data, 
+Status SliceOutUnwantedOutputSection(cudaStream_t stream, + const void* input_data, const std::vector& input_dims, void* output_data, const std::vector& output_dims, diff --git a/onnxruntime/core/providers/cuda/nn/dropout.h b/onnxruntime/core/providers/cuda/nn/dropout.h index 47cf5ce511..5e38a587df 100644 --- a/onnxruntime/core/providers/cuda/nn/dropout.h +++ b/onnxruntime/core/providers/cuda/nn/dropout.h @@ -22,6 +22,7 @@ struct GetRatioDataImpl { template struct DropoutComputeImpl { void operator()(const cudaDeviceProp& prop, + cudaStream_t stream, const int64_t N, const float ratio_data, PhiloxGenerator& generator, @@ -33,7 +34,7 @@ struct DropoutComputeImpl { const CudaT* X_data = reinterpret_cast(X.template Data()); CudaT* Y_data = reinterpret_cast(Y.template MutableData()); - DropoutKernelImpl(prop, N, ratio_data, generator, X_data, Y_data, mask_data); + DropoutKernelImpl(prop, stream, N, ratio_data, generator, X_data, Y_data, mask_data); } }; @@ -81,12 +82,12 @@ Status Dropout::ComputeInternal(OpKernelContext* context) const { const void* X_data = X->DataRaw(); void* Y_data = Y->MutableDataRaw(); if (Y_data != X_data) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y_data, X_data, X->SizeInBytes(), cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y_data, X_data, X->SizeInBytes(), cudaMemcpyDeviceToDevice, Stream())); } // If mask is requested, return all 1s. if (mask != nullptr) { - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(mask->MutableData(), true, N * sizeof(bool))); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(mask->MutableData(), true, N * sizeof(bool), Stream())); } return Status::OK(); @@ -106,7 +107,7 @@ Status Dropout::ComputeInternal(OpKernelContext* context) const { #else utils::MLTypeCallDispatcher t_disp(X->GetElementType()); #endif - t_disp.Invoke(GetDeviceProp(), N, ratio_data, generator, *X, *Y, mask_data); + t_disp.Invoke(GetDeviceProp(), Stream(), N, ratio_data, generator, *X, *Y, mask_data); return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/nn/dropout_impl.cu b/onnxruntime/core/providers/cuda/nn/dropout_impl.cu index ded4a87c40..47d73aa450 100644 --- a/onnxruntime/core/providers/cuda/nn/dropout_impl.cu +++ b/onnxruntime/core/providers/cuda/nn/dropout_impl.cu @@ -69,6 +69,7 @@ __global__ void DropoutKernel( template void DropoutKernelImpl( const cudaDeviceProp& prop, + cudaStream_t stream, const int64_t N, const float ratio, PhiloxGenerator& generator, @@ -83,12 +84,13 @@ void DropoutKernelImpl( const uint64_t counter_offset = static_cast(((N - 1) / (block_size * grid_size * UNROLL) + 1) * UNROLL); auto seeds = generator.NextPhiloxSeeds(counter_offset); - DropoutKernel<<>>(N, ratio, seeds, X_data, Y_data, mask_data); + DropoutKernel<<>>(N, ratio, seeds, X_data, Y_data, mask_data); } #define SPECIALIZED_DROPOUT_IMPL(T) \ template void DropoutKernelImpl( \ const cudaDeviceProp& prop, \ + cudaStream_t stream, \ const int64_t N, \ const float ratio, \ PhiloxGenerator& generator, \ diff --git a/onnxruntime/core/providers/cuda/nn/dropout_impl.h b/onnxruntime/core/providers/cuda/nn/dropout_impl.h index 5c52af1318..37e16710e6 100644 --- a/onnxruntime/core/providers/cuda/nn/dropout_impl.h +++ b/onnxruntime/core/providers/cuda/nn/dropout_impl.h @@ -11,6 +11,7 @@ namespace cuda { template void DropoutKernelImpl( const cudaDeviceProp& prop, + cudaStream_t stream, const int64_t N, const float ratio, PhiloxGenerator& generator, diff --git a/onnxruntime/core/providers/cuda/nn/instance_norm.cc b/onnxruntime/core/providers/cuda/nn/instance_norm.cc index 
1bd1d236f7..8945d639fc 100644 --- a/onnxruntime/core/providers/cuda/nn/instance_norm.cc +++ b/onnxruntime/core/providers/cuda/nn/instance_norm.cc @@ -135,6 +135,7 @@ Status InstanceNorm::ComputeInternal(OpKernelContext* p_op_kernel_context) co fast_divmod fdm_C(gsl::narrow_cast(C)); InstanceNormImpl( + Stream(), x_data, scale_data, bias_data, diff --git a/onnxruntime/core/providers/cuda/nn/instance_norm_impl.cu b/onnxruntime/core/providers/cuda/nn/instance_norm_impl.cu index 98cf179601..c0af3d0580 100644 --- a/onnxruntime/core/providers/cuda/nn/instance_norm_impl.cu +++ b/onnxruntime/core/providers/cuda/nn/instance_norm_impl.cu @@ -31,6 +31,7 @@ __global__ void _InstanceNormKernel( template void InstanceNormImpl( + cudaStream_t stream, const T* input_data, const T* scale, const T* bias, @@ -43,12 +44,12 @@ void InstanceNormImpl( T* output_data, size_t N) { int blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); - _InstanceNormKernel<<>>( + _InstanceNormKernel<<>>( input_data, scale, bias, mean, variance, variance_correction, epsilon, fdm_HW, fdm_C, output_data, (CUDA_LONG)N); } #define SPECIALIZED_IMPL(T) \ - template void InstanceNormImpl(const T* input_data, const T* scale, const T* bias, const T* mean, const T* stddev, const double variance_correction, const double epsilon, const fast_divmod& fdm_HW, const fast_divmod& fdm_C, T* output_data, size_t count); + template void InstanceNormImpl(cudaStream_t stream, const T* input_data, const T* scale, const T* bias, const T* mean, const T* stddev, const double variance_correction, const double epsilon, const fast_divmod& fdm_HW, const fast_divmod& fdm_C, T* output_data, size_t count); SPECIALIZED_IMPL(float) SPECIALIZED_IMPL(double) diff --git a/onnxruntime/core/providers/cuda/nn/instance_norm_impl.h b/onnxruntime/core/providers/cuda/nn/instance_norm_impl.h index 5748746db6..cda9684416 100644 --- a/onnxruntime/core/providers/cuda/nn/instance_norm_impl.h +++ b/onnxruntime/core/providers/cuda/nn/instance_norm_impl.h @@ -8,6 +8,7 @@ namespace cuda { template void InstanceNormImpl( + cudaStream_t stream, const T* input_data, const T* scale, const T* bias, diff --git a/onnxruntime/core/providers/cuda/nn/max_pool_with_index.cu b/onnxruntime/core/providers/cuda/nn/max_pool_with_index.cu index 02335cad1d..2409ee12e3 100644 --- a/onnxruntime/core/providers/cuda/nn/max_pool_with_index.cu +++ b/onnxruntime/core/providers/cuda/nn/max_pool_with_index.cu @@ -88,6 +88,7 @@ __global__ void MaxPoolWithIndexKernel( template void MaxPoolWithIndex( + cudaStream_t stream, const TensorShape& input_shape, const TensorShape& output_shape, const std::vector& kernel_shape, @@ -130,7 +131,7 @@ void MaxPoolWithIndex( fast_divmod fdm_d(static_cast(pooled_depth)); int blocksPerGrid = (int)((output_size + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock); - MaxPoolWithIndexKernel<<>>( + MaxPoolWithIndexKernel<<>>( batchs, channels, height, @@ -164,6 +165,7 @@ void MaxPoolWithIndex( #define INSTANTIATEMAXPOOLWITHINDEX(T) \ template void MaxPoolWithIndex( \ + cudaStream_t stream, \ const TensorShape& input_shape, \ const TensorShape& output_shape, \ const std::vector& kernel_shape, \ diff --git a/onnxruntime/core/providers/cuda/nn/max_pool_with_index.h b/onnxruntime/core/providers/cuda/nn/max_pool_with_index.h index ec796c3d95..3c2420b45b 100644 --- a/onnxruntime/core/providers/cuda/nn/max_pool_with_index.h +++ b/onnxruntime/core/providers/cuda/nn/max_pool_with_index.h @@ -9,6 +9,7 @@ namespace onnxruntime { namespace cuda { template void 
MaxPoolWithIndex( + cudaStream_t stream, const TensorShape& input_shape, const TensorShape& output_shape, const std::vector& kernel_shape, diff --git a/onnxruntime/core/providers/cuda/nn/pool.cc b/onnxruntime/core/providers/cuda/nn/pool.cc index 367930d9b3..af9eeb5381 100644 --- a/onnxruntime/core/providers/cuda/nn/pool.cc +++ b/onnxruntime/core/providers/cuda/nn/pool.cc @@ -187,9 +187,9 @@ Status Pool::ComputeInternal(OpKernelContext* context) const { IAllocatorUniquePtr temp_X = GetScratchBuffer(input_count); auto temp_Y = GetScratchBuffer(output_count); - Impl_Cast(reinterpret_cast(x_data), temp_X.get(), input_count); + Impl_Cast(Stream(), reinterpret_cast(x_data), temp_X.get(), input_count); CUDNN_RETURN_IF_ERROR(cudnnPoolingForward(CudnnHandle(), pooling_desc, &alpha, x_tensor, temp_X.get(), &beta, y_tensor, temp_Y.get())); - Impl_Cast(temp_Y.get(), y_data, output_count); + Impl_Cast(Stream(), temp_Y.get(), y_data, output_count); } else { const auto alpha = Consts::One; const auto beta = Consts::Zero; @@ -239,6 +239,7 @@ Status Pool>::ComputeInternal(OpKernelContext* context) const { if (nullptr != I || !this->pool_attrs_.default_dilations) { auto i_data = nullptr == I ? nullptr : I->template MutableData(); MaxPoolWithIndex( + this->Stream(), x_shape, TensorShape(y_dims), kernel_shape, diff --git a/onnxruntime/core/providers/cuda/nn/shrink.cc b/onnxruntime/core/providers/cuda/nn/shrink.cc index 09eb264b74..cd8d9e2cf3 100644 --- a/onnxruntime/core/providers/cuda/nn/shrink.cc +++ b/onnxruntime/core/providers/cuda/nn/shrink.cc @@ -33,7 +33,7 @@ Status Shrink::ComputeInternal(OpKernelContext* p_op_kernel_context) const { Tensor* Y = p_op_kernel_context->Output(0, x_shape); auto* y_data = reinterpret_cast(Y->template MutableData()); - ShrinkImpl(x_data, bias_, lambd_, y_data, x_size); + ShrinkImpl(Stream(), x_data, bias_, lambd_, y_data, x_size); return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/nn/shrink_impl.cu b/onnxruntime/core/providers/cuda/nn/shrink_impl.cu index 867822561c..4883c1dd69 100644 --- a/onnxruntime/core/providers/cuda/nn/shrink_impl.cu +++ b/onnxruntime/core/providers/cuda/nn/shrink_impl.cu @@ -51,18 +51,19 @@ __global__ void _ShrinkKernel( template void ShrinkImpl( + cudaStream_t stream, const T* input_data, const float bias, const float lambda, T* output_data, size_t N) { int blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); - _ShrinkKernel<<>>( + _ShrinkKernel<<>>( input_data, bias, lambda, output_data, (CUDA_LONG)N); } #define SPECIALIZED_IMPL(T) \ - template void ShrinkImpl(const T* input_data, const float bias, const float lambda, T* output_data, size_t N); + template void ShrinkImpl(cudaStream_t stream, const T* input_data, const float bias, const float lambda, T* output_data, size_t N); SPECIALIZED_IMPL(float) SPECIALIZED_IMPL(double) diff --git a/onnxruntime/core/providers/cuda/nn/shrink_impl.h b/onnxruntime/core/providers/cuda/nn/shrink_impl.h index 05b7183a89..0b324315f7 100644 --- a/onnxruntime/core/providers/cuda/nn/shrink_impl.h +++ b/onnxruntime/core/providers/cuda/nn/shrink_impl.h @@ -8,6 +8,7 @@ namespace cuda { template void ShrinkImpl( + cudaStream_t stream, const T* input_data, const float bias, const float lambda, diff --git a/onnxruntime/core/providers/cuda/object_detection/non_max_suppression.cc b/onnxruntime/core/providers/cuda/object_detection/non_max_suppression.cc index b62575f71a..b75a09b20b 100644 --- a/onnxruntime/core/providers/cuda/object_detection/non_max_suppression.cc +++ 
b/onnxruntime/core/providers/cuda/object_detection/non_max_suppression.cc @@ -66,6 +66,7 @@ Status NonMaxSuppression::ComputeInternal(OpKernelContext* ctx) const { auto* h_number_selected = static_cast(h_number_selected_ptr.get()); ORT_RETURN_IF_ERROR(NonMaxSuppressionImpl( + Stream(), [this](size_t bytes) { return GetScratchBuffer(bytes); }, pc, GetCenterPointBox(), @@ -120,7 +121,8 @@ Status NonMaxSuppression::ComputeInternal(OpKernelContext* ctx) const { concat_sizes_range_gpu.CopyToGpu(); input_ptr.CopyToGpu(); - ORT_RETURN_IF_ERROR(ConcatImpl(sizeof(int64_t), + ORT_RETURN_IF_ERROR(ConcatImpl(Stream(), + sizeof(int64_t), num_elements, last_dim, concat_sizes_gpu.GpuPtr(), diff --git a/onnxruntime/core/providers/cuda/object_detection/non_max_suppression_impl.cu b/onnxruntime/core/providers/cuda/object_detection/non_max_suppression_impl.cu index 270b8283db..28cc457c62 100644 --- a/onnxruntime/core/providers/cuda/object_detection/non_max_suppression_impl.cu +++ b/onnxruntime/core/providers/cuda/object_detection/non_max_suppression_impl.cu @@ -228,7 +228,8 @@ __global__ void NormalizeOutput(const int num_elements, const int* original, int } } -Status NmsGpu(std::function(size_t)> allocator, +Status NmsGpu(cudaStream_t stream, + std::function(size_t)> allocator, const int64_t center_point_box, const float* d_sorted_boxes_float_ptr, const int num_boxes, @@ -249,7 +250,7 @@ Status NmsGpu(std::function(size_t)> allocator, auto* d_nms_mask = static_cast(d_nms_mask_ptr.get()); int blocksPerGrid = (int)(ceil(static_cast(max_nms_mask_size) / GridDim::maxThreadsPerBlock)); - SetZero<<>>(max_nms_mask_size, d_nms_mask); + SetZero<<>>(max_nms_mask_size, d_nms_mask); int* d_delete_mask = d_nms_mask; int* h_selected_count = h_nkeep; @@ -264,7 +265,7 @@ Status NmsGpu(std::function(size_t)> allocator, thread_block.x = kNmsBlockDim; thread_block.y = kNmsBlockDim; thread_block.z = 1; - NMSKernel<<>>(center_point_box, + NMSKernel<<>>(center_point_box, d_sorted_boxes, num_boxes, iou_threshold, @@ -277,9 +278,9 @@ Status NmsGpu(std::function(size_t)> allocator, auto* d_indices = static_cast(d_indices_ptr.get()); blocksPerGrid = (int)(ceil(static_cast(num_boxes) / GridDim::maxThreadsPerBlock)); - Iota<<>>(num_boxes, 0, d_indices); + Iota<<>>(num_boxes, 0, d_indices); - NMSReduce<<<1, 1024, bit_mask_len * sizeof(int)>>>(d_delete_mask, bit_mask_len, num_boxes, max_boxes, d_selected_boxes); + NMSReduce<<<1, 1024, bit_mask_len * sizeof(int), stream>>>(d_delete_mask, bit_mask_len, num_boxes, max_boxes, d_selected_boxes); size_t flagged_buffer_size = 0; CUDA_RETURN_IF_ERROR(cub::DeviceSelect::Flagged(static_cast(nullptr), // temp_storage @@ -288,7 +289,8 @@ Status NmsGpu(std::function(size_t)> allocator, static_cast(nullptr), // selection flag static_cast(nullptr), // selected items static_cast(nullptr), // num_selected - num_boxes)); + num_boxes, + stream)); IAllocatorUniquePtr d_cub_scratch_buffer_ptr{allocator(flagged_buffer_size)}; auto* d_cub_scratch_buffer = static_cast(d_cub_scratch_buffer_ptr.get()); @@ -301,8 +303,10 @@ Status NmsGpu(std::function(size_t)> allocator, d_indices, // input d_selected_boxes, // selection flag d_selected_indices, // selected items - d_num_selected, num_boxes)); - CUDA_RETURN_IF_ERROR(cudaMemcpy(h_selected_count, d_num_selected, sizeof(int), cudaMemcpyDeviceToHost)); + d_num_selected, num_boxes, stream)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(h_selected_count, d_num_selected, sizeof(int), cudaMemcpyDeviceToHost, stream)); + // cudaStreamSynchronize is needed since the value of 
h_selected_count will be used by host after this function. + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream)); return Status::OK(); } @@ -320,6 +324,7 @@ struct DeviceGreaterThan { } // namespace Status NonMaxSuppressionImpl( + cudaStream_t stream, std::function(size_t)> allocator, const PrepareContext& pc, const int64_t center_point_box, @@ -346,8 +351,8 @@ Status NonMaxSuppressionImpl( static_cast(nullptr), // input indices static_cast(nullptr), // sorted indices num_boxes, // num items - 0, 8 * sizeof(float) // sort all bits - )); + 0, 8 * sizeof(float), // sort all bits + stream)); // allocate temporary memory IAllocatorUniquePtr d_cub_sort_buffer_ptr{allocator(cub_sort_temp_storage_bytes)}; @@ -365,7 +370,7 @@ Status NonMaxSuppressionImpl( // create sequense of indices int blocksPerGrid = (int)(ceil(static_cast(num_boxes) / GridDim::maxThreadsPerBlock)); - Iota<<>>(num_boxes, 0, d_indices); + Iota<<>>(num_boxes, 0, d_indices); CUDA_RETURN_IF_ERROR(cudaGetLastError()); // sort scores @@ -378,23 +383,25 @@ Status NonMaxSuppressionImpl( d_sorted_indices, num_boxes, 0, - 8 * sizeof(float) // sort all bits - )); + 8 * sizeof(float), // sort all bits + stream)); // pick sorted scores const Box* original_boxes = reinterpret_cast(boxes_data); Box* sorted_boxes = reinterpret_cast(d_sorted_boxes); - IndexMultiSelect<<>>(num_boxes, d_sorted_indices, original_boxes, sorted_boxes); + IndexMultiSelect<<>>(num_boxes, d_sorted_indices, original_boxes, sorted_boxes); CUDA_RETURN_IF_ERROR(cudaGetLastError()); // STEP 2. filter boxes by scores int limited_num_boxes = num_boxes; if (pc.score_threshold_ != nullptr) { + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream)); thrust::device_ptr sorted_scores_device_ptr(d_sorted_scores); limited_num_boxes = thrust::count_if( sorted_scores_device_ptr, sorted_scores_device_ptr + num_boxes, DeviceGreaterThan(score_threshold)); + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(0)); CUDA_RETURN_IF_ERROR(cudaGetLastError()); if (limited_num_boxes == 0) { @@ -404,7 +411,8 @@ Status NonMaxSuppressionImpl( } // STEP 3. 
launch NMS kernels - ORT_RETURN_IF_ERROR(NmsGpu(allocator, + ORT_RETURN_IF_ERROR(NmsGpu(stream, + allocator, center_point_box, d_sorted_boxes, limited_num_boxes, @@ -424,8 +432,8 @@ Status NonMaxSuppressionImpl( auto* d_normalized_output_indices = static_cast(d_normalized_output_indices_ptr.get()); blocksPerGrid = (int)(ceil(static_cast(num_to_keep) / GridDim::maxThreadsPerBlock)); - IndexMultiSelect<<>>(num_to_keep, d_selected_indices, d_sorted_indices, d_output_indices); - NormalizeOutput<<>>(num_to_keep, d_output_indices, d_normalized_output_indices, batch_index, class_index); + IndexMultiSelect<<>>(num_to_keep, d_selected_indices, d_sorted_indices, d_output_indices); + NormalizeOutput<<>>(num_to_keep, d_output_indices, d_normalized_output_indices, batch_index, class_index); CUDA_RETURN_IF_ERROR(cudaGetLastError()); selected_indices = std::move(d_normalized_output_indices_ptr); diff --git a/onnxruntime/core/providers/cuda/object_detection/non_max_suppression_impl.h b/onnxruntime/core/providers/cuda/object_detection/non_max_suppression_impl.h index 493c115e52..648420125c 100644 --- a/onnxruntime/core/providers/cuda/object_detection/non_max_suppression_impl.h +++ b/onnxruntime/core/providers/cuda/object_detection/non_max_suppression_impl.h @@ -14,6 +14,7 @@ namespace onnxruntime { namespace cuda { Status NonMaxSuppressionImpl( + cudaStream_t stream, std::function(size_t)> allocator, const PrepareContext& pc, const int64_t center_point_box, diff --git a/onnxruntime/core/providers/cuda/object_detection/roialign.cc b/onnxruntime/core/providers/cuda/object_detection/roialign.cc index 5ca757382f..513b82f6aa 100644 --- a/onnxruntime/core/providers/cuda/object_detection/roialign.cc +++ b/onnxruntime/core/providers/cuda/object_detection/roialign.cc @@ -45,6 +45,7 @@ Status RoiAlign::ComputeInternal(OpKernelContext* context) const { if (output_size > 0) { RoiAlignImpl( + Stream(), output_size, // num threads reinterpret_cast::MappedType*>(X_ptr->template Data()), ToCudaType::FromFloat(this->spatial_scale_), diff --git a/onnxruntime/core/providers/cuda/object_detection/roialign_impl.cu b/onnxruntime/core/providers/cuda/object_detection/roialign_impl.cu index 45a35b291e..937007f57b 100644 --- a/onnxruntime/core/providers/cuda/object_detection/roialign_impl.cu +++ b/onnxruntime/core/providers/cuda/object_detection/roialign_impl.cu @@ -174,6 +174,7 @@ __global__ void RoIAlignForward( template void RoiAlignImpl( + cudaStream_t stream, const int64_t nthreads, const T* bottom_data, const T spatial_scale, @@ -189,7 +190,7 @@ void RoiAlignImpl( const bool is_mode_avg, const int64_t* batch_indices_ptr) { int blocksPerGrid = (int)(ceil(static_cast(nthreads) / GridDim::maxThreadsPerBlock)); - RoIAlignForward<<>>( + RoIAlignForward<<>>( nthreads, bottom_data, spatial_scale, @@ -208,6 +209,7 @@ void RoiAlignImpl( #define SPECIALIZED_IMPL(T) \ template void RoiAlignImpl( \ + cudaStream_t stream, \ const int64_t nthreads, \ const T* bottom_data, \ const T spatial_scale, \ diff --git a/onnxruntime/core/providers/cuda/object_detection/roialign_impl.h b/onnxruntime/core/providers/cuda/object_detection/roialign_impl.h index 712771bdf6..312c35a93a 100644 --- a/onnxruntime/core/providers/cuda/object_detection/roialign_impl.h +++ b/onnxruntime/core/providers/cuda/object_detection/roialign_impl.h @@ -12,6 +12,7 @@ namespace cuda { template void RoiAlignImpl( + cudaStream_t stream, const int64_t nthreads, const T* bottom_data, const T spatial_scale, diff --git 
a/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu b/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu index ad8533b00c..6ac4e64900 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu +++ b/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu @@ -284,7 +284,7 @@ __global__ void reduce_matrix_columns_kernel( template Status call_reduce_matrix_columns( - const TIn* input, TOut* output, const int num_rows, const int num_cols, void* buffer, size_t buffer_size) { + cudaStream_t stream, const TIn* input, TOut* output, const int num_rows, const int num_cols, void* buffer, size_t buffer_size) { ORT_ENFORCE(num_rows >= 0 && num_cols >= 0); using TBuf = AccumulationType_t; @@ -301,12 +301,12 @@ Status call_reduce_matrix_columns( // If more than one block is used per grid row, then inter-block reduction is needed. if (grid_dim.x > 1) { - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(block_done_counts_buffer, 0, num_rows * sizeof(int))); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(block_done_counts_buffer, 0, num_rows * sizeof(int), stream)); } const int shared_mem_size = sizeof(TBuf) * block_dim.x * block_dim.y / GPU_WARP_SIZE; reduce_matrix_columns_kernel - <<>>( + <<>>( num_rows, num_cols, input, output, block_reductions_buffer, block_done_counts_buffer); return Status::OK(); @@ -315,41 +315,41 @@ Status call_reduce_matrix_columns( template Status reduce_sum( - const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) { + cudaStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) { return detail::call_reduce_matrix_columns( - input, output, 1, size, buffer, buffer_size); + stream, input, output, 1, size, buffer, buffer_size); } template Status reduce_square_sum( - const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) { + cudaStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) { return detail::call_reduce_matrix_columns( - input, output, 1, size, buffer, buffer_size); + stream, input, output, 1, size, buffer, buffer_size); } template Status reduce_l2_norm( - const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) { + cudaStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) { return detail::call_reduce_matrix_columns( - input, output, 1, size, buffer, buffer_size); + stream, input, output, 1, size, buffer, buffer_size); } template Status reduce_mean( - const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) { + cudaStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) { return detail::call_reduce_matrix_columns( - input, output, 1, size, buffer, buffer_size); + stream, input, output, 1, size, buffer, buffer_size); } #define INSTANTIATE_REDUCE_SUM(TIn, TOut) \ - template Status reduce_sum(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) + template Status reduce_sum(cudaStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) INSTANTIATE_REDUCE_SUM(half, float); INSTANTIATE_REDUCE_SUM(float, float); INSTANTIATE_REDUCE_SUM(double, double); #undef INSTANTIATE_REDUCE_SUM #define INSTANTIATE_REDUCE_SQUARE_SUM(TIn, TOut) \ - template Status reduce_square_sum(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) + template Status reduce_square_sum(cudaStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t 
buffer_size) INSTANTIATE_REDUCE_SQUARE_SUM(half, float); INSTANTIATE_REDUCE_SQUARE_SUM(float, float); INSTANTIATE_REDUCE_SQUARE_SUM(double, double); @@ -359,14 +359,14 @@ INSTANTIATE_REDUCE_SQUARE_SUM(nv_bfloat16, float); #undef INSTANTIATE_REDUCE_SQUARE_SUM #define INSTANTIATE_REDUCE_L2_NORM(TIn, TOut) \ - template Status reduce_l2_norm(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) + template Status reduce_l2_norm(cudaStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) INSTANTIATE_REDUCE_L2_NORM(half, float); INSTANTIATE_REDUCE_L2_NORM(float, float); INSTANTIATE_REDUCE_L2_NORM(double, double); #undef INSTANTIATE_REDUCE_L2_NORM #define INSTANTIATE_REDUCE_MEAN(TIn, TOut) \ - template Status reduce_mean(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) + template Status reduce_mean(cudaStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) INSTANTIATE_REDUCE_MEAN(half, float); INSTANTIATE_REDUCE_MEAN(float, float); INSTANTIATE_REDUCE_MEAN(double, double); @@ -431,11 +431,11 @@ __global__ void reduce_matrix_rows_kernel(const TIn* input, TOut* output, int m, } template -Status call_reduce_matrix_rows(const TIn* input, TOut* output, int m, int n, bool reset_initial_output) { +Status call_reduce_matrix_rows(cudaStream_t stream, const TIn* input, TOut* output, int m, int n, bool reset_initial_output) { ORT_ENFORCE(m >= 0 && n >= 0); if (reset_initial_output) { - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(output, 0, n * sizeof(TOut))); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(output, 0, n * sizeof(TOut), stream)); } constexpr int max_num_threads_in_block = 512; @@ -450,7 +450,7 @@ Status call_reduce_matrix_rows(const TIn* input, TOut* output, int m, int n, boo const dim3 grid(grid_x_dim, grid_y_dim, 1); const dim3 block(block_x_dim, block_y_dim, 1); - reduce_matrix_rows_kernel<<>>( + reduce_matrix_rows_kernel<<>>( input, output, m, n); return Status::OK(); @@ -458,13 +458,13 @@ Status call_reduce_matrix_rows(const TIn* input, TOut* output, int m, int n, boo } // namespace detail template -Status reduce_matrix_rows(const TIn* input, TOut* output, int m, int n, bool reset_initial_output) { +Status reduce_matrix_rows(cudaStream_t stream, const TIn* input, TOut* output, int m, int n, bool reset_initial_output) { using TBuf = AccumulationType_t; - return detail::call_reduce_matrix_rows(input, output, m, n, reset_initial_output); + return detail::call_reduce_matrix_rows(stream, input, output, m, n, reset_initial_output); } #define INSTANTIATE_REDUCE_MATRIX_ROWS(T) \ - template Status reduce_matrix_rows(const T* input, T* output, int m, int n, bool reset_initial_output) + template Status reduce_matrix_rows(cudaStream_t stream, const T* input, T* output, int m, int n, bool reset_initial_output) INSTANTIATE_REDUCE_MATRIX_ROWS(half); INSTANTIATE_REDUCE_MATRIX_ROWS(float); INSTANTIATE_REDUCE_MATRIX_ROWS(double); @@ -474,13 +474,13 @@ INSTANTIATE_REDUCE_MATRIX_ROWS(nv_bfloat16); #undef INSTANTIATE_REDUCE_MATRIX_ROWS template -Status reduce_matrix_columns(const TIn* input, TOut* output, int m, int n, void* buffer, size_t buffer_size) { +Status reduce_matrix_columns(cudaStream_t stream, const TIn* input, TOut* output, int m, int n, void* buffer, size_t buffer_size) { return detail::call_reduce_matrix_columns( - input, output, m, n, buffer, buffer_size); + stream, input, output, m, n, buffer, buffer_size); } #define INSTANTIATE_REDUCE_MATRIX_COLUMNS(T) \ - template Status 
reduce_matrix_columns(const T* input, T* output, int m, int n, void* buffer, size_t buffer_size) + template Status reduce_matrix_columns(cudaStream_t stream, const T* input, T* output, int m, int n, void* buffer, size_t buffer_size) INSTANTIATE_REDUCE_MATRIX_COLUMNS(half); INSTANTIATE_REDUCE_MATRIX_COLUMNS(float); INSTANTIATE_REDUCE_MATRIX_COLUMNS(double); diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_functions.h b/onnxruntime/core/providers/cuda/reduction/reduction_functions.h index 69988862aa..965de5a2bd 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_functions.h +++ b/onnxruntime/core/providers/cuda/reduction/reduction_functions.h @@ -43,19 +43,19 @@ size_t compute_reduction_buffer_size(int size) { /** Computes the sum of the given elements. */ template -Status reduce_sum(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size); +Status reduce_sum(cudaStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size); /** Computes the sum of the squares of the given elements. */ template -Status reduce_square_sum(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size); +Status reduce_square_sum(cudaStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size); /** Computes the L2 norm of the given elements. */ template -Status reduce_l2_norm(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size); +Status reduce_l2_norm(cudaStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size); /** Computes the mean of the given elements. */ template -Status reduce_mean(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size); +Status reduce_mean(cudaStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size); enum class ApplicableMatrixReduction { // can use reduce_matrix_rows() @@ -89,7 +89,7 @@ ApplicableMatrixReduction get_applicable_matrix_reduction( * @param reset_initial_output Whether to reset (i.e., zero) the output values first. */ template -Status reduce_matrix_rows(const TIn* input, TOut* output, int m, int n, bool reset_initial_output = true); +Status reduce_matrix_rows(cudaStream_t stream, const TIn* input, TOut* output, int m, int n, bool reset_initial_output = true); /** * Reduces the columns in a row-major matrix to a single column containing the sum of each row. @@ -101,7 +101,7 @@ Status reduce_matrix_rows(const TIn* input, TOut* output, int m, int n, bool res * @param buffer_size The size of the intermediate buffer in bytes. 
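The header-side change above is mechanical; the behavioral point is on the implementation side: every asynchronous call a reduction helper makes (cudaMemsetAsync, kernel launches) is now enqueued on the stream handed in by the caller instead of the legacy default stream. A minimal stand-alone sketch of that pattern follows; the names are illustrative only, not ORT code.

#include <cuda_runtime.h>

__global__ void AccumulateKernel(const float* in, float* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) atomicAdd(out, in[i]);
}

// All work is ordered on the caller's stream: the memset runs before the
// kernel, and neither call synchronizes the host or the default stream.
cudaError_t ReduceSumOnStream(cudaStream_t stream, const float* d_in, float* d_out, int n) {
  cudaError_t err = cudaMemsetAsync(d_out, 0, sizeof(float), stream);
  if (err != cudaSuccess) return err;
  const int threads = 256;
  const int blocks = (n + threads - 1) / threads;
  AccumulateKernel<<<blocks, threads, 0, stream>>>(d_in, d_out, n);
  return cudaGetLastError();
}

This mirrors what call_reduce_matrix_rows and call_reduce_matrix_columns now do with their stream argument.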
*/ template -Status reduce_matrix_columns(const TIn* input, TOut* output, int m, int n, void* buffer, size_t buffer_size); +Status reduce_matrix_columns(cudaStream_t stream, const TIn* input, TOut* output, int m, int n, void* buffer, size_t buffer_size); } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc index c4d6bc11c0..612dee590a 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc @@ -149,6 +149,7 @@ Status ReduceKernel::ReduceKernelShared( switch (applicable_matrix_reduction) { case ApplicableMatrixReduction::Rows: { return reduce_matrix_rows( + Stream(), reinterpret_cast(X), reinterpret_cast(Y), m, n, false); @@ -167,7 +168,7 @@ Status ReduceKernel::ReduceKernelShared( // ArgMax/ArgMin with FP16 are not supported by cudnn, so convert input to fp32 then call cudnn temp_X = GetScratchBuffer(input_count); cudnn_type_X = CUDNN_DATA_FLOAT; - Impl_Cast(reinterpret_cast(X), temp_X.get(), input_shape.Size()); + Impl_Cast(Stream(), reinterpret_cast(X), temp_X.get(), input_shape.Size()); } // CUDNN requires at least 3D input, so pad 1s if needed @@ -208,7 +209,7 @@ Status ReduceKernel::ReduceKernelShared( input_data_buffer = GetScratchBuffer(input_count); input_data = reinterpret_cast(input_data_buffer.get()); fast_divmod tmp_div; - Impl_Mul(static_cast(SimpleBroadcast::NoBroadcast), nullptr, + Impl_Mul(Stream(), static_cast(SimpleBroadcast::NoBroadcast), nullptr, reinterpret_cast(X), nullptr, reinterpret_cast(X), nullptr, tmp_div, tmp_div, @@ -233,7 +234,8 @@ Status ReduceKernel::ReduceKernelShared( auto log_sum_result = log_sum_result_buffer.get(); BinaryElementwisePreparation prepare; ORT_RETURN_IF_ERROR(prepare.BinaryElementwiseBroadcastPrepareHelper(input_shape, rhs_shape, input_shape)); - Impl_Sub(prepare.output_rank_or_simple_broadcast, + Impl_Sub(Stream(), + prepare.output_rank_or_simple_broadcast, &prepare.lhs_padded_strides, reinterpret_cast(X), &prepare.rhs_padded_strides, @@ -242,7 +244,7 @@ Status ReduceKernel::ReduceKernelShared( prepare.fdm_H, prepare.fdm_C, reinterpret_cast(exp_result), input_count); - Impl_Exp(reinterpret_cast(exp_result), + Impl_Exp(Stream(), reinterpret_cast(exp_result), reinterpret_cast(exp_result), input_count); @@ -253,13 +255,13 @@ Status ReduceKernel::ReduceKernelShared( &zero, output_tensor, reinterpret_cast(log_sum_result))); // Log(Sum) - Impl_Log(reinterpret_cast(log_sum_result), + Impl_Log(Stream(), reinterpret_cast(log_sum_result), reinterpret_cast(log_sum_result), output_count); // Log + ReduceMax fast_divmod tmp_div; - Impl_Add(static_cast(SimpleBroadcast::NoBroadcast), nullptr, + Impl_Add(Stream(), static_cast(SimpleBroadcast::NoBroadcast), nullptr, reinterpret_cast(log_sum_result), nullptr, reinterpret_cast(Y), nullptr, tmp_div, tmp_div, @@ -276,7 +278,7 @@ Status ReduceKernel::ReduceKernelShared( // cudnnReduceTensor for ReduceSum has issue if input and output has same size, we just need to copy the data for this case if (input_count == output_count) { if (reinterpret_cast(Y) != reinterpret_cast(X)) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y, X, input_count * sizeof(T), cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y, X, input_count * sizeof(T), cudaMemcpyDeviceToDevice, Stream())); } } else { CUDNN_RETURN_IF_ERROR(cudnnReduceTensor( @@ -301,11 +303,11 @@ Status ReduceKernel::ReduceKernelShared( } // CUDA 
reduction index is uint32_t for now, cast it to int64_t according to ONNX spec - Impl_Cast(reinterpret_cast(indices_cuda.get()), reinterpret_cast(Y), output_count); + Impl_Cast(Stream(), reinterpret_cast(indices_cuda.get()), reinterpret_cast(Y), output_count); } if (calculate_log_) { - Impl_Log(reinterpret_cast(Y), + Impl_Log(Stream(), reinterpret_cast(Y), reinterpret_cast(Y), output_count); } @@ -421,7 +423,7 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr std::vector& output_dims = prepare_reduce_metadata.output_dims; std::vector& input_dims_cudnn = prepare_reduce_metadata.input_dims_cudnn; std::vector& output_dims_cudnn = prepare_reduce_metadata.output_dims_cudnn; - + cudaStream_t stream = static_cast(cuda_ep.GetComputeStream()); // special case when there is a dim value of 0 in the shape. if (input_count == 0) { assert(output.Shape().Size() == 0); @@ -436,6 +438,7 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr switch (applicable_matrix_reduction) { case ApplicableMatrixReduction::Rows: { return reduce_matrix_rows( + stream, reinterpret_cast(input.template Data()), reinterpret_cast(output.template MutableData()), m, n); @@ -444,6 +447,7 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr const auto buffer_size_bytes = compute_reduce_matrix_columns_buffer_size(m, n); auto buffer = cuda_ep.GetScratchBuffer(buffer_size_bytes); return reduce_matrix_columns( + stream, reinterpret_cast(input.template Data()), reinterpret_cast(output.template MutableData()), m, n, buffer.get(), buffer_size_bytes); @@ -455,7 +459,7 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr // This reduction keep adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000. 
// Therefore zeroing out the memory is required - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(output.MutableDataRaw(), 0, output.SizeInBytes())); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(output.MutableDataRaw(), 0, output.SizeInBytes(), stream)); IAllocatorUniquePtr temp_X; cudnnDataType_t cudnn_type_X = CudnnTensor::GetDataType(); @@ -464,7 +468,7 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr // ArgMax/ArgMin with FP16 are not supported by cudnn, so convert input to fp32 then call cudnn temp_X = cuda_ep.GetScratchBuffer(input_count); cudnn_type_X = CUDNN_DATA_FLOAT; - Impl_Cast(reinterpret_cast(input.template Data()), temp_X.get(), input_shape.Size()); + Impl_Cast(stream, reinterpret_cast(input.template Data()), temp_X.get(), input_shape.Size()); } CudnnReduceDescriptor reduce_desc; @@ -497,7 +501,8 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr input_data_buffer = cuda_ep.GetScratchBuffer(input_count); input_data = reinterpret_cast(input_data_buffer.get()); fast_divmod tmp_div; - Impl_Mul(static_cast(SimpleBroadcast::NoBroadcast), nullptr, + Impl_Mul(stream, + static_cast(SimpleBroadcast::NoBroadcast), nullptr, reinterpret_cast(input.template Data()), nullptr, reinterpret_cast(input.template Data()), nullptr, tmp_div, tmp_div, @@ -507,7 +512,7 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr // This happens when the input is Scalar if (input_count == output_count) { if (output.template MutableData() != input.template Data()) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output.template MutableData(), input.template Data(), input_count * sizeof(T), cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output.template MutableData(), input.template Data(), input_count * sizeof(T), cudaMemcpyDeviceToDevice, stream)); } } else { // Reduce max -- Max/Min will output indices data @@ -536,7 +541,8 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr auto log_sum_result = log_sum_result_buffer.get(); BinaryElementwisePreparation prepare; ORT_RETURN_IF_ERROR(prepare.BinaryElementwiseBroadcastPrepareHelper(input_shape, output_shape, input_shape)); - Impl_Sub(prepare.output_rank_or_simple_broadcast, + Impl_Sub(stream, + prepare.output_rank_or_simple_broadcast, &prepare.lhs_padded_strides, reinterpret_cast(input.template Data()), &prepare.rhs_padded_strides, @@ -545,14 +551,15 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr prepare.fdm_H, prepare.fdm_C, reinterpret_cast(exp_result), input_count); - Impl_Exp(reinterpret_cast(exp_result), + Impl_Exp(stream, + reinterpret_cast(exp_result), reinterpret_cast(exp_result), input_count); // cudnnReduceTensor for ReduceSum has issue if input and output has same size, we just need to copy the data for this case // This happens when the input is Scalar. We do not need to add anything in this case. 
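A note on the copies in this function: the device-to-device cudaMemcpyAsync calls that gain a stream argument stay fully asynchronous, because later kernels on the same stream see the copied data in order. Only when a value is copied back for the host to read, as in the NonMaxSuppression change earlier in this patch, must the stream be synchronized first. A small illustrative sketch (not ORT code):

#include <cuda_runtime.h>

// Copy a device-side count back and wait for it before the host uses it.
cudaError_t ReadCountFromDevice(cudaStream_t stream, const int* d_count, int* h_count) {
  cudaError_t err = cudaMemcpyAsync(h_count, d_count, sizeof(int),
                                    cudaMemcpyDeviceToHost, stream);
  if (err != cudaSuccess) return err;
  // Without this wait, the host could read *h_count before the copy finishes.
  return cudaStreamSynchronize(stream);
}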
if (input_count == output_count) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(reinterpret_cast(log_sum_result), exp_result, input_count * sizeof(T), cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(reinterpret_cast(log_sum_result), exp_result, input_count * sizeof(T), cudaMemcpyDeviceToDevice, stream)); } else { // ReduceSum CUDNN_RETURN_IF_ERROR(cudnnReduceTensor( @@ -563,13 +570,13 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr } // Log(Sum) - Impl_Log(reinterpret_cast(log_sum_result), + Impl_Log(stream, reinterpret_cast(log_sum_result), reinterpret_cast(log_sum_result), output_count); // Log + ReduceMax fast_divmod tmp_div; - Impl_Add(static_cast(SimpleBroadcast::NoBroadcast), nullptr, + Impl_Add(stream, static_cast(SimpleBroadcast::NoBroadcast), nullptr, reinterpret_cast(log_sum_result), nullptr, reinterpret_cast(output.template MutableData()), nullptr, tmp_div, tmp_div, @@ -581,7 +588,7 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr // cudnnReduceTensor for ReduceSum has issue if input and output has same size, we just need to copy the data for this case // This happens when the input is Scalar. We do not need to add anything in this case. if (input_count == output_count) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(reinterpret_cast(output.template MutableData()), input_data, input_count * sizeof(T), cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(reinterpret_cast(output.template MutableData()), input_data, input_count * sizeof(T), cudaMemcpyDeviceToDevice, stream)); } else { CUDNN_RETURN_IF_ERROR(cudnnReduceTensor( cuda_ep.PerThreadCudnnHandle(), reduce_desc, indices_cuda.get(), indices_bytes, @@ -593,7 +600,7 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr // cudnnReduceTensor for ReduceSum has issue if input and output has same size, we just need to copy the data for this case if (input_count == output_count) { if (output.template MutableData() != input.template Data()) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output.template MutableData(), input.template Data(), input_count * sizeof(T), cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output.template MutableData(), input.template Data(), input_count * sizeof(T), cudaMemcpyDeviceToDevice, stream)); } } else { CUDNN_RETURN_IF_ERROR(cudnnReduceTensor( @@ -603,7 +610,8 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr &zero, output_tensor, reinterpret_cast(output.template MutableData()))); } } - } else { // For ArgMax & ArgMin ops, use the indicies as the output with int64 type + } else { + // For ArgMax & ArgMin ops, use the indicies as the output with int64 type // cudnnReduceTensor has issue if input and output has same size, which will happen if the axis to be reduced has dim value of 1. 
// the output is zeros of the output size if (input_count == output_count) { @@ -626,12 +634,13 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr } // CUDA reduction index is uint32_t for now, cast it to int64_t according to ONNX spec - Impl_Cast(reinterpret_cast(indices_cuda.get()), output.template MutableData(), output_count); + Impl_Cast(stream, reinterpret_cast(indices_cuda.get()), output.template MutableData(), output_count); } } if (calculate_log) { - Impl_Log(reinterpret_cast(output.template MutableData()), + Impl_Log(stream, + reinterpret_cast(output.template MutableData()), reinterpret_cast(output.template MutableData()), output_count); } @@ -661,7 +670,7 @@ Status ReduceKernel::ComputeImpl(OpKernelContext* ctx, cudnnRe // empty axes and no-op if (axes.empty() && noop_with_empty_axes_) { auto* Y = ctx->Output(0, X->Shape()); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes(), cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes(), cudaMemcpyDeviceToDevice, Stream())); return Status::OK(); } @@ -700,7 +709,7 @@ Status ReduceKernel::ComputeImpl(OpKernelContext* ctx, cudnnRe if (axes.empty() && noop_with_empty_axes_) { \ auto* Y = ctx->Output(0, X->Shape()); \ CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes(), \ - cudaMemcpyDeviceToDevice)); \ + cudaMemcpyDeviceToDevice, Stream())); \ return Status::OK(); \ } \ \ @@ -722,12 +731,12 @@ Status ReduceKernel::ComputeImpl(OpKernelContext* ctx, cudnnRe if (input_count == output_count) { \ if (Y->template MutableData() != X->template Data()) { \ CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData(), X->template Data(), \ - input_count * sizeof(T), cudaMemcpyDeviceToDevice)); \ + input_count * sizeof(T), cudaMemcpyDeviceToDevice, Stream())); \ } \ return Status::OK(); \ } \ \ - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(Y->MutableDataRaw(), 0, Y->SizeInBytes())); \ + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(Y->MutableDataRaw(), 0, Y->SizeInBytes(), Stream())); \ \ size_t indices_bytes = 0; \ size_t workspace_bytes = 0; \ @@ -737,7 +746,7 @@ Status ReduceKernel::ComputeImpl(OpKernelContext* ctx, cudnnRe \ cudnnDataType_t cudnn_type_X = CUDNN_DATA_FLOAT; \ IAllocatorUniquePtr temp_X = GetScratchBuffer(input_count); \ - Impl_Cast(reinterpret_cast(X->template Data()), temp_X.get(), X->Shape().Size()); \ + Impl_Cast(Stream(), reinterpret_cast(X->template Data()), temp_X.get(), X->Shape().Size()); \ \ ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, cudnn_type_X, CUDNN_REDUCE_TENSOR_FLATTENED_INDICES)); \ ORT_RETURN_IF_ERROR(input_tensor.Set(input_dims_cudnn, cudnn_type_X)); \ @@ -756,7 +765,7 @@ Status ReduceKernel::ComputeImpl(OpKernelContext* ctx, cudnnRe workspace_cuda.get(), workspace_bytes, &one, input_tensor, temp_X.get(), \ &zero, output_tensor, temp_Y.get())); \ \ - Impl_Cast(temp_Y.get(), reinterpret_cast(Y->template MutableData()), output_count); \ + Impl_Cast(Stream(), temp_Y.get(), reinterpret_cast(Y->template MutableData()), output_count); \ \ return Status::OK(); \ } @@ -788,7 +797,7 @@ Status ReduceKernel::ComputeImpl if (axes.empty() && noop_with_empty_axes_) { auto* Y = ctx->Output(0, X->Shape()); CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData(), X->template Data(), - X->SizeInBytes(), cudaMemcpyDeviceToDevice)); + X->SizeInBytes(), cudaMemcpyDeviceToDevice, Stream())); return Status::OK(); } @@ -810,7 
+819,7 @@ Status ReduceKernel::ComputeImpl if (input_count == output_count) { if (Y->template MutableData() != X->template Data()) { CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData(), X->template Data(), - input_count * sizeof(BFloat16), cudaMemcpyDeviceToDevice)); + input_count * sizeof(BFloat16), cudaMemcpyDeviceToDevice, Stream())); } return Status::OK(); } @@ -821,13 +830,13 @@ Status ReduceKernel::ComputeImpl get_applicable_matrix_reduction(cudnn_reduce_op, X->Shape().GetDims(), axes, m, n); switch (applicable_matrix_reduction) { case ApplicableMatrixReduction::Rows: { - return reduce_matrix_rows(reinterpret_cast(X->template Data()), + return reduce_matrix_rows(Stream(), reinterpret_cast(X->template Data()), reinterpret_cast(Y->template MutableData()), m, n); } case ApplicableMatrixReduction::Columns: { const auto buffer_size_bytes = compute_reduce_matrix_columns_buffer_size(m, n); auto buffer = cuda_ep_->GetScratchBuffer(buffer_size_bytes); - return reduce_matrix_columns(reinterpret_cast(X->template Data()), + return reduce_matrix_columns(Stream(), reinterpret_cast(X->template Data()), reinterpret_cast(Y->template MutableData()), m, n, buffer.get(), buffer_size_bytes); } @@ -836,7 +845,7 @@ Status ReduceKernel::ComputeImpl } } - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(Y->MutableDataRaw(), 0, Y->SizeInBytes())); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(Y->MutableDataRaw(), 0, Y->SizeInBytes(), Stream())); size_t indices_bytes = 0; size_t workspace_bytes = 0; @@ -846,7 +855,7 @@ Status ReduceKernel::ComputeImpl cudnnDataType_t cudnn_type_X = CUDNN_DATA_FLOAT; IAllocatorUniquePtr temp_X = GetScratchBuffer(input_count); - Impl_Cast(reinterpret_cast(X->template Data()), temp_X.get(), + Impl_Cast(Stream(), reinterpret_cast(X->template Data()), temp_X.get(), X->Shape().Size()); ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, cudnn_type_X, CUDNN_REDUCE_TENSOR_FLATTENED_INDICES)); @@ -866,7 +875,7 @@ Status ReduceKernel::ComputeImpl workspace_cuda.get(), workspace_bytes, &one, input_tensor, temp_X.get(), &zero, output_tensor, temp_Y.get())); - Impl_Cast(temp_Y.get(), reinterpret_cast(Y->template MutableData()), output_count); + Impl_Cast(Stream(), temp_Y.get(), reinterpret_cast(Y->template MutableData()), output_count); return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc index 24d1f4e3c7..01c237e0f5 100644 --- a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc +++ b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc @@ -34,7 +34,7 @@ void CudnnRnnBase::SetWeightBias(const cudnnHandle_t handle, cudnnGetFilterNdDescriptor(filter_desc, 3, &dt, &tf, &numDims, matDims.data()); int count = matDims[0] * matDims[1] * matDims[2]; - cudaMemcpyAsync(mem_offset, pos + offset, count * sizeof(T), cudaMemcpyDeviceToDevice); + CUDA_CALL_THROW(cudaMemcpyAsync(mem_offset, pos + offset, count * sizeof(T), cudaMemcpyDeviceToDevice, Stream())); offset += count; } template @@ -190,7 +190,8 @@ Status CudnnRnnBase::ComputeInternal(OpKernelContext* ctx) const { if (reverse_) { // reverse input data x_reversed_data = GetScratchBuffer(seq_length * batch_size * input_size); - ReverseBySequence(gsl::narrow_cast(seq_length), + ReverseBySequence(Stream(), + gsl::narrow_cast(seq_length), gsl::narrow_cast(batch_size), gsl::narrow_cast(input_size), reinterpret_cast(x_data), @@ -331,14 +332,16 @@ Status CudnnRnnBase::ComputeInternal(OpKernelContext* ctx) const { y_reorganized_data = GetScratchBuffer(output_size); 
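Besides routing Stream() through, the cudnn_rnn_base.cc change also stops discarding the cudaError_t returned by cudaMemcpyAsync, wrapping the calls in CUDA_CALL_THROW / CUDA_RETURN_IF_ERROR. A stand-alone sketch of the same idea, using an illustrative helper rather than the ORT macros:

#include <cuda_runtime.h>
#include <stdexcept>
#include <string>

inline void ThrowOnCudaError(cudaError_t err, const char* what) {
  if (err != cudaSuccess) {
    throw std::runtime_error(std::string(what) + ": " + cudaGetErrorString(err));
  }
}

void CopyReversedOutput(void* dst, const void* src, size_t bytes, cudaStream_t stream) {
  // Async copies still report enqueue-time failures (bad pointers, invalid
  // stream) through their return value; dropping it hides real errors.
  ThrowOnCudaError(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToDevice, stream),
                   "cudaMemcpyAsync(reversed RNN output)");
}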
if (reverse_) { //reverse output data - ReverseBySequence(gsl::narrow_cast(seq_length), + ReverseBySequence(Stream(), + gsl::narrow_cast(seq_length), gsl::narrow_cast(batch_size), gsl::narrow_cast(hidden_size_), reinterpret_cast(y_data), reinterpret_cast(y_reorganized_data.get()), output_size); } else { - ReorderBidirectionalDataInSequence(gsl::narrow_cast(seq_length), + ReorderBidirectionalDataInSequence(Stream(), + gsl::narrow_cast(seq_length), gsl::narrow_cast(batch_size), gsl::narrow_cast(hidden_size_), reinterpret_cast(y_data), @@ -348,7 +351,7 @@ Status CudnnRnnBase::ComputeInternal(OpKernelContext* ctx) const { if (Y != nullptr) { // User specified this optional output, so need to copy the reversed data to orignial place - cudaMemcpyAsync(y_data, y_reorganized_data.get(), output_size * sizeof(T), cudaMemcpyDeviceToDevice); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(y_data, y_reorganized_data.get(), output_size * sizeof(T), cudaMemcpyDeviceToDevice, Stream())); } else { y_data = y_reorganized_data.get(); } @@ -363,7 +366,8 @@ Status CudnnRnnBase::ComputeInternal(OpKernelContext* ctx) const { CudaAsyncBuffer sequence_lens_buffer(this, batch_size); memcpy(sequence_lens_buffer.CpuPtr(), sequence_lens_data, batch_size * sizeof(int32_t)); ORT_RETURN_IF_ERROR(sequence_lens_buffer.CopyToGpu()); - RnnMaskImpl(gsl::narrow_cast(num_directions_), + RnnMaskImpl(Stream(), + gsl::narrow_cast(num_directions_), gsl::narrow_cast(seq_length), gsl::narrow_cast(batch_size), gsl::narrow_cast(hidden_size_), @@ -386,7 +390,8 @@ void CudnnRnnBase::SetZeroSequences(const int64_t zero_seq_index_cache_size, CudaAsyncBuffer zero_seq_index_cache_async_buffer(this, zero_seq_index_cache_size); memcpy(zero_seq_index_cache_async_buffer.CpuPtr(), zero_seq_index_cache.data(), zero_seq_index_cache_size * sizeof(int32_t)); ORT_THROW_IF_ERROR(zero_seq_index_cache_async_buffer.CopyToGpu()); - MaskZeroSequences(gsl::narrow_cast(hidden_size_), + MaskZeroSequences(Stream(), + gsl::narrow_cast(hidden_size_), reinterpret_cast(y_data), reinterpret_cast(y_h_data), reinterpret_cast(y_c_data), diff --git a/onnxruntime/core/providers/cuda/rnn/rnn_impl.cu b/onnxruntime/core/providers/cuda/rnn/rnn_impl.cu index 930c3a4ddd..d485855ddb 100644 --- a/onnxruntime/core/providers/cuda/rnn/rnn_impl.cu +++ b/onnxruntime/core/providers/cuda/rnn/rnn_impl.cu @@ -22,7 +22,8 @@ __global__ void _ReverseBySequenceKernel(const int32_t seq_length, } template -void ReverseBySequence(const int32_t seq_length, +void ReverseBySequence(cudaStream_t stream, + const int32_t seq_length, const int32_t batch_size, const int32_t input_or_hidden_size, const T* data, @@ -32,7 +33,7 @@ void ReverseBySequence(const int32_t seq_length, int32_t block_size = batch_size * input_or_hidden_size; fast_divmod div_batch_block(block_size); int blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); - _ReverseBySequenceKernel<<>>( + _ReverseBySequenceKernel<<>>( seq_length, block_size, div_batch_block, data, reversed_data, (CUDA_LONG)N); } @@ -61,7 +62,8 @@ __global__ void _BidirectionalDataKernel(const int32_t seq_length, } template -void ReorderBidirectionalDataInSequence(const int32_t seq_length, +void ReorderBidirectionalDataInSequence(cudaStream_t stream, + const int32_t seq_length, const int32_t batch_size, const int32_t hidden_size, const T* data, @@ -74,7 +76,7 @@ void ReorderBidirectionalDataInSequence(const int32_t seq_length, fast_divmod div_output_block(hidden_size); int blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); - 
_BidirectionalDataKernel<<>>( + _BidirectionalDataKernel<<>>( seq_length, batch_size, hidden_size, seq_block_size, div_seq_block, div_output_block, data, reordered_data, (CUDA_LONG)N); @@ -116,7 +118,8 @@ __global__ void _RnnMaskKernel(const int32_t seq_length, } template -void RnnMaskImpl(const int32_t num_directions, +void RnnMaskImpl(cudaStream_t stream, + const int32_t num_directions, const int32_t seq_length, const int32_t batch_size, const int32_t hidden_size, @@ -128,7 +131,7 @@ void RnnMaskImpl(const int32_t num_directions, fast_divmod div_dir_block(batch_size * hidden_size); fast_divmod div_batch_block(hidden_size); int blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); - _RnnMaskKernel<<>>( + _RnnMaskKernel<<>>( seq_length, batch_size, hidden_size, sequence_lens, div_seq_block, div_dir_block, div_batch_block, y_output_data, y_h_output_data, (CUDA_LONG)N); } @@ -164,19 +167,21 @@ __global__ void _MaskZeroSequences(const int32_t hidden_size, } template -void MaskZeroSequences(const int32_t hidden_size, +void MaskZeroSequences(cudaStream_t stream, + const int32_t hidden_size, T* y_output_data, T* y_h_output_data, T* y_c_output_data, const int32_t* zeor_seq_index_cache, const size_t N) { int blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); - _MaskZeroSequences<<>>( + _MaskZeroSequences<<>>( hidden_size, y_output_data, y_h_output_data, y_c_output_data, zeor_seq_index_cache, (CUDA_LONG)N); } #define SPECIALIZED_RNN_IMPL(T) \ - template void RnnMaskImpl(const int32_t num_directions, \ + template void RnnMaskImpl(cudaStream_t stream, \ + const int32_t num_directions, \ const int32_t seq_length, \ const int32_t batch_size, \ const int32_t hidden_size, \ @@ -184,19 +189,22 @@ void MaskZeroSequences(const int32_t hidden_size, T* y_output_data, \ T* y_h_output_data, \ const size_t N); \ - template void ReverseBySequence(const int32_t seq_length, \ + template void ReverseBySequence(cudaStream_t stream, \ + const int32_t seq_length, \ const int32_t batch_size, \ const int32_t hidden_size, \ const T* data, \ T* reversed_data, \ const size_t N); \ - template void ReorderBidirectionalDataInSequence(const int32_t seq_length, \ + template void ReorderBidirectionalDataInSequence(cudaStream_t stream,\ + const int32_t seq_length, \ const int32_t batch_size, \ const int32_t hidden_size,\ const T* data, \ T* reordered_data, \ const size_t N); \ -template void MaskZeroSequences(const int32_t hidden_size, \ +template void MaskZeroSequences(cudaStream_t stream, \ + const int32_t hidden_size, \ T* y_output_data, \ T* y_h_output_data, \ T* y_c_output_data, \ diff --git a/onnxruntime/core/providers/cuda/rnn/rnn_impl.h b/onnxruntime/core/providers/cuda/rnn/rnn_impl.h index 78ceabf23b..0c00c2d2a9 100644 --- a/onnxruntime/core/providers/cuda/rnn/rnn_impl.h +++ b/onnxruntime/core/providers/cuda/rnn/rnn_impl.h @@ -9,7 +9,8 @@ namespace onnxruntime { namespace cuda { template -void ReverseBySequence(const int32_t seq_length, +void ReverseBySequence(cudaStream_t stream, + const int32_t seq_length, const int32_t batch_size, const int32_t input_or_hidden_size, const T* data, @@ -17,7 +18,8 @@ void ReverseBySequence(const int32_t seq_length, const size_t N); template -void ReorderBidirectionalDataInSequence(const int32_t seq_length, +void ReorderBidirectionalDataInSequence(cudaStream_t stream, + const int32_t seq_length, const int32_t batch_size, const int32_t hidden_size, const T* data, @@ -25,7 +27,8 @@ void ReorderBidirectionalDataInSequence(const int32_t seq_length, 
const size_t N); template -void RnnMaskImpl(const int32_t num_directions, +void RnnMaskImpl(cudaStream_t stream, + const int32_t num_directions, const int32_t seq_length, const int32_t batch_size, const int32_t hidden_size, @@ -35,7 +38,8 @@ void RnnMaskImpl(const int32_t num_directions, const size_t N); template -void MaskZeroSequences(const int32_t hidden_size, +void MaskZeroSequences(cudaStream_t stream, + const int32_t hidden_size, T* y_output_data, T* y_h_output_data, T* y_c_output_data, diff --git a/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h b/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h index 1efd51d1ba..483934990b 100644 --- a/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h +++ b/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h @@ -34,14 +34,14 @@ template class IConstantBuffer { public: virtual ~IConstantBuffer(){}; - virtual const T* GetBuffer(size_t count) = 0; + virtual const T* GetBuffer(cudaStream_t stream, size_t count) = 0; }; template std::unique_ptr> CreateConstantOnes(); template -void Fill(T* output, T value, int64_t count); +void Fill(cudaStream_t stream, T* output, T value, int64_t count); /* This is a utility wrapper for arbitrary type array diff --git a/onnxruntime/core/providers/cuda/shared_inc/fpgeneric.h b/onnxruntime/core/providers/cuda/shared_inc/fpgeneric.h index 90fdd2aea5..6b46550fae 100644 --- a/onnxruntime/core/providers/cuda/shared_inc/fpgeneric.h +++ b/onnxruntime/core/providers/cuda/shared_inc/fpgeneric.h @@ -417,24 +417,24 @@ inline cublasStatus_t cublasGemmStridedBatchedHelper(cublasHandle_t handle, #endif // transpose using geam -inline cublasStatus_t cublasTransposeHelper(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, const float* alpha, const float* A, int lda, const float* beta, const float* B, int ldb, float* C, int ldc) { +inline cublasStatus_t cublasTransposeHelper(cudaStream_t, cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, const float* alpha, const float* A, int lda, const float* beta, const float* B, int ldb, float* C, int ldc) { return cublasSgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); } -inline cublasStatus_t cublasTransposeHelper(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, const double* alpha, const double* A, int lda, const double* beta, const double* B, int ldb, double* C, int ldc) { +inline cublasStatus_t cublasTransposeHelper(cudaStream_t, cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, const double* alpha, const double* A, int lda, const double* beta, const double* B, int ldb, double* C, int ldc) { return cublasDgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); } -cublasStatus_t cublasTransposeHelper(cublasHandle_t, cublasOperation_t, cublasOperation_t, int m, int n, const half*, const half* A, int, const half*, const half*, int, half* C, int); +cublasStatus_t cublasTransposeHelper(cudaStream_t, cublasHandle_t, cublasOperation_t, cublasOperation_t, int m, int n, const half*, const half* A, int, const half*, const half*, int, half* C, int); // copy -inline cublasStatus_t cublasCopyHelper(cublasHandle_t handle, int n, const float* x, int incx, float* y, int incy) { +inline cublasStatus_t cublasCopyHelper(cudaStream_t, cublasHandle_t handle, int n, const float* x, int incx, float* y, int incy) { return cublasScopy(handle, n, x, incx, y, incy); } -inline cublasStatus_t 
cublasCopyHelper(cublasHandle_t handle, int n, const double* x, int incx, double* y, int incy) { +inline cublasStatus_t cublasCopyHelper(cudaStream_t, cublasHandle_t handle, int n, const double* x, int incx, double* y, int incy) { return cublasDcopy(handle, n, x, incx, y, incy); } -cublasStatus_t cublasCopyHelper(cublasHandle_t handle, int n, const half* x, int incx, half* y, int incy); +cublasStatus_t cublasCopyHelper(cudaStream_t stream, cublasHandle_t handle, int n, const half* x, int incx, half* y, int incy); #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 -cublasStatus_t cublasCopyHelper(cublasHandle_t handle, int n, const nv_bfloat16* x, int incx, nv_bfloat16* y, int incy); +cublasStatus_t cublasCopyHelper(cudaStream_t stream, cublasHandle_t handle, int n, const nv_bfloat16* x, int incx, nv_bfloat16* y, int incy); #endif diff --git a/onnxruntime/core/providers/cuda/tensor/cast_op.cc b/onnxruntime/core/providers/cuda/tensor/cast_op.cc index 156597e3a0..fe5146c17c 100644 --- a/onnxruntime/core/providers/cuda/tensor/cast_op.cc +++ b/onnxruntime/core/providers/cuda/tensor/cast_op.cc @@ -71,6 +71,7 @@ Status Cast::ComputeInternal(OpKernelContext* context) const { case TP_TYPE: \ if (count > 0) { \ Impl_Cast::MappedType>( \ + Stream(), \ x_data, \ reinterpret_cast::MappedType*>(Y->template MutableData()), \ count); \ diff --git a/onnxruntime/core/providers/cuda/tensor/compress.cc b/onnxruntime/core/providers/cuda/tensor/compress.cc index 56445f8333..91dd3f5222 100644 --- a/onnxruntime/core/providers/cuda/tensor/compress.cc +++ b/onnxruntime/core/providers/cuda/tensor/compress.cc @@ -52,10 +52,24 @@ Status Compress::ComputeInternal(OpKernelContext* ctx) const { auto condition_cumulative_sum_buffer = GetScratchBuffer(valid_condition_length); auto condition_cumulative_sum = condition_cumulative_sum_buffer.get(); - PrefixSumImpl(reinterpret_cast(condition_data), condition_cumulative_sum, valid_condition_length); + size_t temp_storage_bytes = 0; + CUDA_RETURN_IF_ERROR(CompressCalcPrefixSumTempStorageBytes(Stream(), + reinterpret_cast(condition_data), + condition_cumulative_sum, + static_cast(valid_condition_length), + temp_storage_bytes)); + auto temp_buffer = GetScratchBuffer(temp_storage_bytes); + auto d_temp_storage = temp_buffer.get(); + CUDA_RETURN_IF_ERROR(CompressInclusivePrefixSum(Stream(), + d_temp_storage, + temp_storage_bytes, + reinterpret_cast(condition_data), + condition_cumulative_sum, + static_cast(valid_condition_length))); + // cudaMemcpyAsync from device memory to pageable host memory will return only once the copy has completed. int32_t positive_condition_count = 0; - CUDA_RETURN_IF_ERROR(cudaMemcpy(&positive_condition_count, condition_cumulative_sum + valid_condition_length - 1, sizeof(int32_t), cudaMemcpyDeviceToHost)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(&positive_condition_count, condition_cumulative_sum + valid_condition_length - 1, sizeof(int32_t), cudaMemcpyDeviceToHost, Stream())); std::vector output_dims(input_dimensions); if (has_axis_) { @@ -80,7 +94,8 @@ Status Compress::ComputeInternal(OpKernelContext* ctx) const { } } - ORT_RETURN_IF_ERROR(CompressImpl(element_bytes, + ORT_RETURN_IF_ERROR(CompressImpl(Stream(), + element_bytes, gsl::narrow_cast(valid_condition_length), gsl::narrow_cast(axis_right_stride), has_axis_ ? 
gsl::narrow_cast(input_dimensions[axis]) diff --git a/onnxruntime/core/providers/cuda/tensor/compress_impl.cu b/onnxruntime/core/providers/cuda/tensor/compress_impl.cu index 58d4102936..6f936e5965 100644 --- a/onnxruntime/core/providers/cuda/tensor/compress_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/compress_impl.cu @@ -3,21 +3,25 @@ #include "core/providers/cuda/cu_inc/common.cuh" #include "core/providers/cuda/cuda_common.h" -#include "compress_impl.h" + //TODO:fix the warnings #ifdef _MSC_VER #pragma warning(disable : 4244) #endif -#include -#include + +#include "core/providers/cuda/tensor/compress_impl.h" +#include namespace onnxruntime { namespace cuda { -void PrefixSumImpl(const int8_t* condition_data, - int32_t* condition_cumulative_sum, - const size_t length) { - thrust::inclusive_scan(thrust::device, condition_data, condition_data + length, condition_cumulative_sum); +cudaError_t CompressCalcPrefixSumTempStorageBytes(cudaStream_t stream, const int8_t* condition_data, int* condition_cumulative_sum, int length, size_t& temp_storage_bytes) { + return cub::DeviceScan::InclusiveSum( + nullptr, temp_storage_bytes, condition_data, condition_cumulative_sum, length, stream); +} +cudaError_t CompressInclusivePrefixSum(cudaStream_t stream, void* d_temp_storage, size_t temp_storage_bytes, const int8_t* condition_data, int* condition_cumulative_sum, int length) { + return cub::DeviceScan::InclusiveSum( + d_temp_storage, temp_storage_bytes, condition_data, condition_cumulative_sum, length, stream); } template @@ -44,7 +48,8 @@ __global__ void _CompressKernel(const int32_t valid_condition_length, } } -Status CompressImpl(const size_t element_bytes, +Status CompressImpl(cudaStream_t stream, + const size_t element_bytes, const int32_t valid_condition_length, const int32_t axis_right_stride, const int32_t input_axis_dim_length, @@ -62,7 +67,7 @@ Status CompressImpl(const size_t element_bytes, switch (element_bytes) { case sizeof(int8_t): - _CompressKernel<<>>( + _CompressKernel<<>>( valid_condition_length, axis_right_stride_div, input_axis_included_stride_div, @@ -74,7 +79,7 @@ Status CompressImpl(const size_t element_bytes, (CUDA_LONG)N); break; case sizeof(int16_t): - _CompressKernel<<>>( + _CompressKernel<<>>( valid_condition_length, axis_right_stride_div, input_axis_included_stride_div, @@ -86,7 +91,7 @@ Status CompressImpl(const size_t element_bytes, (CUDA_LONG)N); break; case sizeof(int32_t): - _CompressKernel<<>>( + _CompressKernel<<>>( valid_condition_length, axis_right_stride_div, input_axis_included_stride_div, @@ -98,7 +103,7 @@ Status CompressImpl(const size_t element_bytes, (CUDA_LONG)N); break; case sizeof(int64_t): - _CompressKernel<<>>( + _CompressKernel<<>>( valid_condition_length, axis_right_stride_div, input_axis_included_stride_div, diff --git a/onnxruntime/core/providers/cuda/tensor/compress_impl.h b/onnxruntime/core/providers/cuda/tensor/compress_impl.h index 08005944cc..3397841476 100644 --- a/onnxruntime/core/providers/cuda/tensor/compress_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/compress_impl.h @@ -9,11 +9,11 @@ namespace onnxruntime { namespace cuda { -void PrefixSumImpl(const int8_t* condition_data, - int32_t* condition_cumulative_sum, - const size_t length); +cudaError_t CompressCalcPrefixSumTempStorageBytes(cudaStream_t stream, const int8_t* condition_data, int* condition_cumulative_sum, int length, size_t& temp_storage_bytes); +cudaError_t CompressInclusivePrefixSum(cudaStream_t stream, void* d_temp_storage, size_t temp_storage_bytes, const int8_t* 
condition_data, int* condition_cumulative_sum, int length); -Status CompressImpl(const size_t element_bytes, +Status CompressImpl(cudaStream_t stream, + const size_t element_bytes, const int32_t valid_condition_length, const int32_t axis_right_stride, const int32_t input_axis_dim_length, diff --git a/onnxruntime/core/providers/cuda/tensor/concat.cc b/onnxruntime/core/providers/cuda/tensor/concat.cc index 309d9c8243..dfeace4b7e 100644 --- a/onnxruntime/core/providers/cuda/tensor/concat.cc +++ b/onnxruntime/core/providers/cuda/tensor/concat.cc @@ -77,7 +77,8 @@ Status Concat::ComputeInternal(OpKernelContext* ctx) const { int block_size_inside_axis_dim = static_cast(p.output_axis_pitch / p.output_tensor->Shape()[p.axis]); int block_size_including_axis_dim = static_cast(p.output_axis_pitch); auto element_bytes = p.output_tensor->DataType()->Size(); - ORT_RETURN_IF_ERROR(ConcatImpl(element_bytes, + ORT_RETURN_IF_ERROR(ConcatImpl(Stream(), + element_bytes, block_size_including_axis_dim, block_size_inside_axis_dim, concat_sizes_gpu.GpuPtr(), diff --git a/onnxruntime/core/providers/cuda/tensor/concat_impl.cu b/onnxruntime/core/providers/cuda/tensor/concat_impl.cu index 2a24efe9ca..6047f12189 100644 --- a/onnxruntime/core/providers/cuda/tensor/concat_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/concat_impl.cu @@ -38,7 +38,8 @@ __global__ void _ConcatKernel(const fast_divmod block_size_including_axis_dim_di output_data[id] = reinterpret_cast(input_ptr[input_index])[input_pos]; } -Status ConcatImpl(const size_t element_bytes, +Status ConcatImpl(cudaStream_t stream, + const size_t element_bytes, const int block_size_including_axis_dim, const int block_size_inside_axis_dim, const int64_t* concat_sizes, @@ -54,7 +55,7 @@ Status ConcatImpl(const size_t element_bytes, switch (element_bytes) { case sizeof(int8_t): - _ConcatKernel<<>>( + _ConcatKernel<<>>( block_size_including_axis_dim_div, block_size_inside_axis_dim_div, concat_sizes, concat_sizes_range, axis_dimension_input_output_mapping, reinterpret_cast(output_data), @@ -62,7 +63,7 @@ Status ConcatImpl(const size_t element_bytes, (CUDA_LONG)N); break; case sizeof(int16_t): - _ConcatKernel<<>>( + _ConcatKernel<<>>( block_size_including_axis_dim_div, block_size_inside_axis_dim_div, concat_sizes, concat_sizes_range, axis_dimension_input_output_mapping, reinterpret_cast(output_data), @@ -70,7 +71,7 @@ Status ConcatImpl(const size_t element_bytes, (CUDA_LONG)N); break; case sizeof(int32_t): - _ConcatKernel<<>>( + _ConcatKernel<<>>( block_size_including_axis_dim_div, block_size_inside_axis_dim_div, concat_sizes, concat_sizes_range, axis_dimension_input_output_mapping, reinterpret_cast(output_data), @@ -78,7 +79,7 @@ Status ConcatImpl(const size_t element_bytes, (CUDA_LONG)N); break; case sizeof(int64_t): - _ConcatKernel<<>>( + _ConcatKernel<<>>( block_size_including_axis_dim_div, block_size_inside_axis_dim_div, concat_sizes, concat_sizes_range, axis_dimension_input_output_mapping, reinterpret_cast(output_data), diff --git a/onnxruntime/core/providers/cuda/tensor/concat_impl.h b/onnxruntime/core/providers/cuda/tensor/concat_impl.h index 110bf5bf32..2a3b6ba9f9 100644 --- a/onnxruntime/core/providers/cuda/tensor/concat_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/concat_impl.h @@ -9,7 +9,8 @@ namespace onnxruntime { namespace cuda { -Status ConcatImpl(const size_t element_bytes, +Status ConcatImpl(cudaStream_t stream, + const size_t element_bytes, const int block_size_including_axis_dim, const int block_size_inside_axis_dim, const int64_t* 
concat_sizes, diff --git a/onnxruntime/core/providers/cuda/tensor/expand.cc b/onnxruntime/core/providers/cuda/tensor/expand.cc index a4040261d4..8ee8e3df91 100644 --- a/onnxruntime/core/providers/cuda/tensor/expand.cc +++ b/onnxruntime/core/providers/cuda/tensor/expand.cc @@ -98,6 +98,7 @@ Status Expand::ComputeInternal(OpKernelContext* ctx) const { } return ExpandImpl( + Stream(), input_data_tensor.DataType()->Size(), gsl::narrow_cast(output_shape.Size()), gsl::narrow_cast(input_data_tensor.Shape().Size()), diff --git a/onnxruntime/core/providers/cuda/tensor/expand_impl.cu b/onnxruntime/core/providers/cuda/tensor/expand_impl.cu index 79a7dababa..fe7716696a 100644 --- a/onnxruntime/core/providers/cuda/tensor/expand_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/expand_impl.cu @@ -22,11 +22,11 @@ __global__ void _FillFromDataPtrKernel(T* output_data, const T* input_data, CUDA } template -void FillFromDataPtr(T* output_data, const T* input_data, int64_t count) { +void FillFromDataPtr(cudaStream_t stream, T* output_data, const T* input_data, int64_t count) { int blocksPerGrid = gsl::narrow_cast(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); CUDA_LONG N = static_cast(count); _FillFromDataPtrKernel - <<>>(output_data, input_data, N); + <<>>(output_data, input_data, N); } template @@ -89,10 +89,11 @@ __global__ void ExpandKernel( } } -Status ExpandByFill(const size_t element_size, const int N, const void* input_data, void* output_data) { +Status ExpandByFill(cudaStream_t stream, const size_t element_size, const int N, const void* input_data, void* output_data) { #define EXPAND_FILL_ON(TYPE) \ case sizeof(TYPE): \ - FillFromDataPtr(reinterpret_cast(output_data), \ + FillFromDataPtr(stream, \ + reinterpret_cast(output_data), \ reinterpret_cast(input_data), \ static_cast(N)); \ break @@ -109,6 +110,7 @@ Status ExpandByFill(const size_t element_size, const int N, const void* input_da } Status Expand2D( + cudaStream_t stream, const size_t element_size, const int N, const void* input_data, @@ -118,7 +120,7 @@ Status Expand2D( const int input_view_stride1) { #define EXPAND2D_ON(TYPE) \ case sizeof(TYPE): \ - ExpandKernel2D<<>>( \ + ExpandKernel2D<<>>( \ N, reinterpret_cast(input_data), reinterpret_cast(output_data), \ fdm_output_stride0, input_view_stride0, input_view_stride1); \ break @@ -136,6 +138,7 @@ Status Expand2D( } Status ExpandImpl( + cudaStream_t stream, const size_t element_size, const int N_output, const int N_input, @@ -146,12 +149,12 @@ Status ExpandImpl( const int rank = static_cast(output_strides.Size()); if (rank == 1) { if (N_input == N_output) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_data, input_data, N_output * element_size, cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_data, input_data, N_output * element_size, cudaMemcpyDeviceToDevice, stream)); } else { // N_input == 1 - return ExpandByFill(element_size, N_output, input_data, output_data); + return ExpandByFill(stream, element_size, N_output, input_data, output_data); } } else if (rank == 2) { - return Expand2D(element_size, N_output, input_data, output_data, + return Expand2D(stream, element_size, N_output, input_data, output_data, output_strides[0], static_cast(input_strides[0]), static_cast(input_strides[1])); @@ -162,7 +165,7 @@ Status ExpandImpl( #define EXPAND_ON(TYPE) \ case sizeof(TYPE): \ ExpandKernel \ - <<>>( \ + <<>>( \ rank, N_output, reinterpret_cast(input_data), reinterpret_cast(output_data), \ output_strides, input_strides); \ break diff 
--git a/onnxruntime/core/providers/cuda/tensor/expand_impl.h b/onnxruntime/core/providers/cuda/tensor/expand_impl.h index 27d5d69d9c..e64c601323 100644 --- a/onnxruntime/core/providers/cuda/tensor/expand_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/expand_impl.h @@ -12,6 +12,7 @@ namespace onnxruntime { namespace cuda { Status ExpandImpl( + cudaStream_t stream, const size_t element_size, const int N_output, const int N_input, diff --git a/onnxruntime/core/providers/cuda/tensor/eye_like.cc b/onnxruntime/core/providers/cuda/tensor/eye_like.cc index 61ae265d60..82ea145da9 100644 --- a/onnxruntime/core/providers/cuda/tensor/eye_like.cc +++ b/onnxruntime/core/providers/cuda/tensor/eye_like.cc @@ -35,6 +35,7 @@ ONNX_OPERATOR_KERNEL_EX( #define TYPED_FUNCTION_CALL(T) \ EyeLikeImpl::MappedType>( \ + Stream(), \ offset, \ dim1 + 1, \ reinterpret_cast::MappedType*>(T2->template MutableData()), \ @@ -52,7 +53,7 @@ Status EyeLike::ComputeInternal(OpKernelContext* context) const { // set output tensor shape same as input tensor and set all values to zero auto* T2 = context->Output(0, input_dims); - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(T2->MutableDataRaw(), 0, T2->SizeInBytes())); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(T2->MutableDataRaw(), 0, T2->SizeInBytes(), Stream())); auto dim0 = input_dims[0]; auto dim1 = input_dims[1]; diff --git a/onnxruntime/core/providers/cuda/tensor/eye_like_impl.cu b/onnxruntime/core/providers/cuda/tensor/eye_like_impl.cu index 8f1216e43c..a3e588a288 100644 --- a/onnxruntime/core/providers/cuda/tensor/eye_like_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/eye_like_impl.cu @@ -23,6 +23,7 @@ __global__ void _EyeLikeKernel( template void EyeLikeImpl( + cudaStream_t stream, size_t offset, size_t stripe, T* output_data, @@ -31,11 +32,12 @@ void EyeLikeImpl( int blocksPerGrid = (int)(ceil(static_cast(diag_count) / block_size)); CUDA_LONG N = static_cast(diag_count); - _EyeLikeKernel<<>>(offset, stripe, output_data, N); + _EyeLikeKernel<<>>(offset, stripe, output_data, N); } #define SPECIALIZED_IMPL(T) \ template void EyeLikeImpl( \ + cudaStream_t stream, \ size_t offset, \ size_t stripe, \ T* output_data, \ diff --git a/onnxruntime/core/providers/cuda/tensor/eye_like_impl.h b/onnxruntime/core/providers/cuda/tensor/eye_like_impl.h index f95ca63782..db06a2d3ea 100644 --- a/onnxruntime/core/providers/cuda/tensor/eye_like_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/eye_like_impl.h @@ -12,6 +12,7 @@ namespace cuda { template void EyeLikeImpl( + cudaStream_t stream, size_t offset, // offset of first element in diagnal size_t stripe, // stripe, here it's width + 1 T* output_data, // output buffer diff --git a/onnxruntime/core/providers/cuda/tensor/flatten.cc b/onnxruntime/core/providers/cuda/tensor/flatten.cc index 0ac18a5dff..7f36a5fdb0 100644 --- a/onnxruntime/core/providers/cuda/tensor/flatten.cc +++ b/onnxruntime/core/providers/cuda/tensor/flatten.cc @@ -66,7 +66,7 @@ Status Flatten::ComputeInternal(OpKernelContext* ctx) const { void* target = Y->MutableDataRaw(); if (target != source) { CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(target, source, X_shape.Size() * X->DataType()->Size(), - cudaMemcpyDeviceToDevice)); + cudaMemcpyDeviceToDevice, Stream())); } return Status::OK(); diff --git a/onnxruntime/core/providers/cuda/tensor/gather.cc b/onnxruntime/core/providers/cuda/tensor/gather.cc index d857c5719a..f648a59e06 100644 --- a/onnxruntime/core/providers/cuda/tensor/gather.cc +++ b/onnxruntime/core/providers/cuda/tensor/gather.cc @@ -77,6 +77,7 @@ Status 
Gather::ComputeInternal(OpKernelContext* context) const { if (p.indices_tensor->IsDataType() || p.indices_tensor->IsDataType()) { GatherImpl( + Stream(), input_block_size, indices_max, divmod_output_block_size, diff --git a/onnxruntime/core/providers/cuda/tensor/gather_elements.cc b/onnxruntime/core/providers/cuda/tensor/gather_elements.cc index 82da342db7..6ada2248f7 100644 --- a/onnxruntime/core/providers/cuda/tensor/gather_elements.cc +++ b/onnxruntime/core/providers/cuda/tensor/gather_elements.cc @@ -77,6 +77,7 @@ Status GatherElements::ComputeInternal(OpKernelContext* context) const { if (indices_tensor->IsDataType() || indices_tensor->IsDataType()) { GatherElementsImpl( + Stream(), input_rank, input_tensor->DataRaw(), input_dims[axis], diff --git a/onnxruntime/core/providers/cuda/tensor/gather_elements_impl.cu b/onnxruntime/core/providers/cuda/tensor/gather_elements_impl.cu index fc4cc644c1..87920a7fcb 100644 --- a/onnxruntime/core/providers/cuda/tensor/gather_elements_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/gather_elements_impl.cu @@ -83,6 +83,7 @@ __global__ void _GatherElementsKernel( } void GatherElementsImpl( + cudaStream_t stream, const int64_t rank, const void* input_data, const int64_t input_dim_along_axis, @@ -103,7 +104,7 @@ void GatherElementsImpl( switch (element_size) { case sizeof(int8_t): { using CudaType = typename ToCudaType::MappedType; - _GatherElementsKernel<<>>( + _GatherElementsKernel<<>>( rank, reinterpret_cast(input_data), input_dim_along_axis, input_strides, indices_data, indices_size, index_element_size, indices_strides, axis, reinterpret_cast(output_data)); @@ -111,7 +112,7 @@ void GatherElementsImpl( case sizeof(int16_t): { using CudaType = typename ToCudaType::MappedType; - _GatherElementsKernel<<>>( + _GatherElementsKernel<<>>( rank, reinterpret_cast(input_data), input_dim_along_axis, input_strides, indices_data, indices_size, index_element_size, indices_strides, axis, reinterpret_cast(output_data)); @@ -119,7 +120,7 @@ void GatherElementsImpl( case sizeof(int32_t): { using CudaType = typename ToCudaType::MappedType; - _GatherElementsKernel<<>>( + _GatherElementsKernel<<>>( rank, reinterpret_cast(input_data), input_dim_along_axis, input_strides, indices_data, indices_size, index_element_size, indices_strides, axis, reinterpret_cast(output_data)); @@ -127,7 +128,7 @@ void GatherElementsImpl( case sizeof(int64_t): { using CudaType = typename ToCudaType::MappedType; - _GatherElementsKernel<<>>( + _GatherElementsKernel<<>>( rank, reinterpret_cast(input_data), input_dim_along_axis, input_strides, indices_data, indices_size, index_element_size, indices_strides, axis, reinterpret_cast(output_data)); diff --git a/onnxruntime/core/providers/cuda/tensor/gather_elements_impl.h b/onnxruntime/core/providers/cuda/tensor/gather_elements_impl.h index 1caaea647c..920415678b 100644 --- a/onnxruntime/core/providers/cuda/tensor/gather_elements_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/gather_elements_impl.h @@ -10,6 +10,7 @@ namespace onnxruntime { namespace cuda { void GatherElementsImpl( + cudaStream_t stream, const int64_t rank, // both inputs have same rank and this is validated in the main Compute const void* input_data, const int64_t input_dim_along_axis, diff --git a/onnxruntime/core/providers/cuda/tensor/gather_impl.cu b/onnxruntime/core/providers/cuda/tensor/gather_impl.cu index aa42e6d6fb..2fb91e7ce5 100644 --- a/onnxruntime/core/providers/cuda/tensor/gather_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/gather_impl.cu @@ -52,6 +52,7 
@@ __global__ void _GatherKernel( } void GatherImpl( + cudaStream_t stream, const int64_t input_block_size, const int64_t indices_max, const fast_divmod& output_block_size, @@ -68,28 +69,28 @@ void GatherImpl( switch (element_size) { case sizeof(int8_t): { using CudaType = typename ToCudaType::MappedType; - _GatherKernel<<>>( + _GatherKernel<<>>( input_block_size, indices_max, output_block_size, block_size, indices_data, index_element_size, reinterpret_cast(input_data), reinterpret_cast(output_data), (CUDA_LONG)N); } break; case sizeof(int16_t): { using CudaType = typename ToCudaType::MappedType; - _GatherKernel<<>>( + _GatherKernel<<>>( input_block_size, indices_max, output_block_size, block_size, indices_data, index_element_size, reinterpret_cast(input_data), reinterpret_cast(output_data), (CUDA_LONG)N); } break; case sizeof(int32_t): { using CudaType = typename ToCudaType::MappedType; - _GatherKernel<<>>( + _GatherKernel<<>>( input_block_size, indices_max, output_block_size, block_size, indices_data, index_element_size, reinterpret_cast(input_data), reinterpret_cast(output_data), (CUDA_LONG)N); } break; case sizeof(int64_t): { using CudaType = typename ToCudaType::MappedType; - _GatherKernel<<>>( + _GatherKernel<<>>( input_block_size, indices_max, output_block_size, block_size, indices_data, index_element_size, reinterpret_cast(input_data), reinterpret_cast(output_data), (CUDA_LONG)N); diff --git a/onnxruntime/core/providers/cuda/tensor/gather_impl.h b/onnxruntime/core/providers/cuda/tensor/gather_impl.h index 11af5c3888..03fd1dee46 100644 --- a/onnxruntime/core/providers/cuda/tensor/gather_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/gather_impl.h @@ -9,6 +9,7 @@ namespace onnxruntime { namespace cuda { void GatherImpl( + cudaStream_t stream, const int64_t input_block_size, const int64_t indices_max, const fast_divmod& output_block_size, diff --git a/onnxruntime/core/providers/cuda/tensor/gather_nd.cc b/onnxruntime/core/providers/cuda/tensor/gather_nd.cc index 209fd57eca..1fd4b3f89e 100644 --- a/onnxruntime/core/providers/cuda/tensor/gather_nd.cc +++ b/onnxruntime/core/providers/cuda/tensor/gather_nd.cc @@ -40,6 +40,7 @@ Status CheckBatchDimensionsMatch( template Status GatherNDBase::PrepareCompute( + cudaStream_t stream, const int64_t batch_dims, const TensorShape& input_shape, const TensorShape& indices_shape, @@ -70,13 +71,14 @@ Status GatherNDBase::PrepareCompute( sizes_from_slice_dims_buffer.get(), sizes_from_slice_dims.data(), sizes_from_slice_dims.size() * sizeof(int64_t), - cudaMemcpyHostToDevice)); + cudaMemcpyHostToDevice, stream)); input_slice_offsets_buffer = GetScratchBuffer(num_slices); TArray input_dims(input_shape.GetDims()); ComputeSliceOffsetsImpl( + stream, batch_dims, input_dims, num_slices, @@ -145,13 +147,15 @@ REGISTER_KERNEL_VERSIONED_TYPED_GATHER_ND(int64_t, 12, 12) template struct GatherNDComputeImpl { - void operator()(const int64_t num_slices, + void operator()(cudaStream_t stream, + const int64_t num_slices, const int64_t slice_size, const void* const kernel_input_data, void* const kernel_output_data, int64_t* const input_slice_offsets_data) const { typedef typename ToCudaType::MappedType CudaT; - GatherNDImpl(num_slices, kernel_input_data, + GatherNDImpl(stream, + num_slices, kernel_input_data, kernel_output_data, slice_size, input_slice_offsets_data); } @@ -191,14 +195,15 @@ Status GatherND::ComputeInternal(OpKernelContext* context) const { int64_t num_slices; int64_t slice_size; IAllocatorUniquePtr input_slice_offsets_buffer; - 
ORT_RETURN_IF_ERROR(PrepareCompute(batch_dims_, input_shape, indices_shape, indices_tensor, + ORT_RETURN_IF_ERROR(PrepareCompute(Stream(), + batch_dims_, input_shape, indices_shape, indices_tensor, num_slices, slice_size, input_slice_offsets_buffer)); const void* const kernel_input_data = input_tensor->DataRaw(); void* const kernel_output_data = output_tensor->MutableDataRaw(); utils::MLTypeCallDispatcher t_disp(input_tensor->GetElementType()); - t_disp.Invoke(num_slices, slice_size, kernel_input_data, kernel_output_data, input_slice_offsets_buffer.get()); + t_disp.Invoke(Stream(), num_slices, slice_size, kernel_input_data, kernel_output_data, input_slice_offsets_buffer.get()); return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/tensor/gather_nd.h b/onnxruntime/core/providers/cuda/tensor/gather_nd.h index 56414fad69..527a4b8c54 100644 --- a/onnxruntime/core/providers/cuda/tensor/gather_nd.h +++ b/onnxruntime/core/providers/cuda/tensor/gather_nd.h @@ -23,6 +23,7 @@ class GatherNDBase : public CudaKernel { protected: template Status PrepareCompute( + cudaStream_t stream, const int64_t batch_dims, const TensorShape& input_shape, const TensorShape& indices_shape, diff --git a/onnxruntime/core/providers/cuda/tensor/gather_nd_impl.cu b/onnxruntime/core/providers/cuda/tensor/gather_nd_impl.cu index c0323acaec..3f0275547c 100644 --- a/onnxruntime/core/providers/cuda/tensor/gather_nd_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/gather_nd_impl.cu @@ -52,6 +52,7 @@ __global__ void _GatherNDKernel( template void ComputeSliceOffsetsImpl( + cudaStream_t stream, const int64_t batch_dims, const TArray input_dims, const size_t num_slices, @@ -62,7 +63,7 @@ void ComputeSliceOffsetsImpl( const TIndex* const indices_data, // num_slices * num_slice_dims elements int64_t* const input_slice_offsets_data) { // num_slices elements const unsigned int blocks_per_grid = static_cast(CeilDiv(num_slices, GridDim::maxThreadsPerBlock)); - _ComputeSliceOffsetsKernel<<>>( + _ComputeSliceOffsetsKernel<<>>( batch_dims, input_dims, num_slices, @@ -76,18 +77,20 @@ void ComputeSliceOffsetsImpl( template void GatherNDImpl( + cudaStream_t stream, const size_t num_slices, const void* input_data, void* output_data, const size_t slice_size, const int64_t* input_slice_offsets_data) { const unsigned int blocks_per_grid = static_cast(CeilDiv(num_slices * slice_size, GridDim::maxThreadsPerBlock)); - _GatherNDKernel<<>>( + _GatherNDKernel<<>>( num_slices, static_cast(input_data), static_cast(output_data), slice_size, input_slice_offsets_data); } #define SPECIALIZED_COMPUTE_SLICE_OFFSETS_IMPL(TIndex) \ template void ComputeSliceOffsetsImpl( \ + cudaStream_t stream, \ const int64_t batch_dims, \ const TArray input_dims, \ const size_t num_slices, \ @@ -99,7 +102,7 @@ void GatherNDImpl( int64_t* const input_slice_offsets_data); #define SPECIALIZED_IMPL(T) \ - template void GatherNDImpl(const size_t num_slices, const void* input_data, void* output_data, const size_t slice_size, const int64_t* input_slice_offsets_data); + template void GatherNDImpl(cudaStream_t stream, const size_t num_slices, const void* input_data, void* output_data, const size_t slice_size, const int64_t* input_slice_offsets_data); SPECIALIZED_COMPUTE_SLICE_OFFSETS_IMPL(int32_t) SPECIALIZED_COMPUTE_SLICE_OFFSETS_IMPL(int64_t) diff --git a/onnxruntime/core/providers/cuda/tensor/gather_nd_impl.h b/onnxruntime/core/providers/cuda/tensor/gather_nd_impl.h index e989fb330a..828f6ab6af 100644 --- a/onnxruntime/core/providers/cuda/tensor/gather_nd_impl.h +++ 
b/onnxruntime/core/providers/cuda/tensor/gather_nd_impl.h @@ -9,6 +9,7 @@ namespace cuda { template void ComputeSliceOffsetsImpl( + cudaStream_t stream, const int64_t batch_dims, const TArray input_dims, const size_t num_slices, @@ -21,6 +22,7 @@ void ComputeSliceOffsetsImpl( template void GatherNDImpl( + cudaStream_t stream, const size_t num_slices, const void* input_data, void* output_data, @@ -30,6 +32,7 @@ void GatherNDImpl( #ifdef ENABLE_TRAINING template void GatherNDGradImpl( + cudaStream_t stream, const size_t num_slices, const void* update_data, void* output_data, diff --git a/onnxruntime/core/providers/cuda/tensor/identity_op.h b/onnxruntime/core/providers/cuda/tensor/identity_op.h index 3a6c48b31c..f00bb6414d 100644 --- a/onnxruntime/core/providers/cuda/tensor/identity_op.h +++ b/onnxruntime/core/providers/cuda/tensor/identity_op.h @@ -25,7 +25,7 @@ class IdentityOp final : public CudaKernel { void* target = Y->MutableDataRaw(X_type); //If source and target pointers are not equal, we need to copy the data. if (target != source) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(target, source, X->Shape().Size() * X->DataType()->Size(), cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(target, source, X->Shape().Size() * X->DataType()->Size(), cudaMemcpyDeviceToDevice, Stream())); } if (is_dropout) { @@ -39,7 +39,7 @@ class IdentityOp final : public CudaKernel { void* mask_data = mask->MutableDataRaw(); // In 'test'/'inference' mode, there are no input values dropped out // so fill the buffer with 0/false - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(mask_data, 0, mask->SizeInBytes())); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(mask_data, 0, mask->SizeInBytes(), Stream())); } } diff --git a/onnxruntime/core/providers/cuda/tensor/nonzero_impl.cu b/onnxruntime/core/providers/cuda/tensor/nonzero_impl.cu index 1ac1ae79e9..90be2b8b27 100644 --- a/onnxruntime/core/providers/cuda/tensor/nonzero_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/nonzero_impl.cu @@ -17,15 +17,15 @@ int NonZeroCalcBlockCount(int64_t x_size) { } cudaError_t NonZeroCalcPrefixSumTempStorageBytes( - int* prefix_counts, int number_of_blocks, size_t& temp_storage_bytes) { + cudaStream_t stream, int* prefix_counts, int number_of_blocks, size_t& temp_storage_bytes) { temp_storage_bytes = 0; - return cub::DeviceScan::InclusiveSum(nullptr, temp_storage_bytes, prefix_counts, prefix_counts, number_of_blocks); + return cub::DeviceScan::InclusiveSum(nullptr, temp_storage_bytes, prefix_counts, prefix_counts, number_of_blocks, stream); } cudaError_t NonZeroInclusivePrefixSum( - void* d_temp_storage, size_t temp_storage_bytes, int* prefix_counts, int number_of_blocks) { + cudaStream_t stream, void* d_temp_storage, size_t temp_storage_bytes, int* prefix_counts, int number_of_blocks) { return cub::DeviceScan::InclusiveSum( - d_temp_storage, temp_storage_bytes, prefix_counts, prefix_counts, number_of_blocks); + d_temp_storage, temp_storage_bytes, prefix_counts, prefix_counts, number_of_blocks, stream); } template @@ -70,37 +70,37 @@ __global__ void NonZeroOutputPositionsKernel( } template -cudaError_t NonZeroCountEachBlock(const InputT* x, int64_t x_size, int* count_in_blocks) { +cudaError_t NonZeroCountEachBlock(cudaStream_t stream, const InputT* x, int64_t x_size, int* count_in_blocks) { int num_blocks = NonZeroCalcBlockCount(x_size); - NonZeroCountEachBlockKernel<<>>( + NonZeroCountEachBlockKernel<<>>( x, x_size, count_in_blocks); return cudaSuccess; } template cudaError_t NonZeroOutputPositions( - const InputT* x, int64_t 
x_size, int x_rank, const TArray& x_strides, + cudaStream_t stream, const InputT* x, int64_t x_size, int x_rank, const TArray& x_strides, const int* prefix_counts, int nonzero_elements, int64_t* results) { int num_blocks = NonZeroCalcBlockCount(x_size); - NonZeroOutputPositionsKernel<<>>( + NonZeroOutputPositionsKernel<<>>( x, x_size, x_rank, x_strides, prefix_counts, nonzero_elements, results); return cudaSuccess; } -template cudaError_t NonZeroCountEachBlock(const bool*, int64_t, int*); -template cudaError_t NonZeroCountEachBlock(const uint8_t*, int64_t, int*); -template cudaError_t NonZeroCountEachBlock(const int64_t*, int64_t, int*); -template cudaError_t NonZeroCountEachBlock(const int32_t*, int64_t, int*); -template cudaError_t NonZeroCountEachBlock(const float*, int64_t, int*); -template cudaError_t NonZeroCountEachBlock(const half*, int64_t, int*); +template cudaError_t NonZeroCountEachBlock(cudaStream_t stream, const bool*, int64_t, int*); +template cudaError_t NonZeroCountEachBlock(cudaStream_t stream, const uint8_t*, int64_t, int*); +template cudaError_t NonZeroCountEachBlock(cudaStream_t stream, const int64_t*, int64_t, int*); +template cudaError_t NonZeroCountEachBlock(cudaStream_t stream, const int32_t*, int64_t, int*); +template cudaError_t NonZeroCountEachBlock(cudaStream_t stream, const float*, int64_t, int*); +template cudaError_t NonZeroCountEachBlock(cudaStream_t stream, const half*, int64_t, int*); -template cudaError_t NonZeroOutputPositions(const bool*, int64_t, int, const TArray&, const int*, int, int64_t*); -template cudaError_t NonZeroOutputPositions(const uint8_t*, int64_t, int, const TArray&, const int*, int, int64_t*); -template cudaError_t NonZeroOutputPositions(const int64_t*, int64_t, int, const TArray&, const int*, int, int64_t*); -template cudaError_t NonZeroOutputPositions(const int32_t*, int64_t, int, const TArray&, const int*, int, int64_t*); -template cudaError_t NonZeroOutputPositions(const float*, int64_t, int, const TArray&, const int*, int, int64_t*); -template cudaError_t NonZeroOutputPositions(const half*, int64_t, int, const TArray&, const int*, int, int64_t*); +template cudaError_t NonZeroOutputPositions(cudaStream_t stream, const bool*, int64_t, int, const TArray&, const int*, int, int64_t*); +template cudaError_t NonZeroOutputPositions(cudaStream_t stream, const uint8_t*, int64_t, int, const TArray&, const int*, int, int64_t*); +template cudaError_t NonZeroOutputPositions(cudaStream_t stream, const int64_t*, int64_t, int, const TArray&, const int*, int, int64_t*); +template cudaError_t NonZeroOutputPositions(cudaStream_t stream, const int32_t*, int64_t, int, const TArray&, const int*, int, int64_t*); +template cudaError_t NonZeroOutputPositions(cudaStream_t stream, const float*, int64_t, int, const TArray&, const int*, int, int64_t*); +template cudaError_t NonZeroOutputPositions(cudaStream_t stream, const half*, int64_t, int, const TArray&, const int*, int, int64_t*); } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/nonzero_impl.h b/onnxruntime/core/providers/cuda/tensor/nonzero_impl.h index 7d55e83133..dfbe433bd5 100644 --- a/onnxruntime/core/providers/cuda/tensor/nonzero_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/nonzero_impl.h @@ -10,19 +10,19 @@ namespace cuda { int NonZeroCalcBlockCount(int64_t x_size); -cudaError_t NonZeroCalcPrefixSumTempStorageBytes(int* prefix_counts, int number_of_blocks, size_t& ); +cudaError_t NonZeroCalcPrefixSumTempStorageBytes(cudaStream_t stream, int* 
prefix_counts, int number_of_blocks, size_t& ); -cudaError_t NonZeroInclusivePrefixSum(void* d_temp_storage, size_t temp_storage_bytes, int* prefix_counts, int number_of_blocks); +cudaError_t NonZeroInclusivePrefixSum(cudaStream_t stream, void* d_temp_storage, size_t temp_storage_bytes, int* prefix_counts, int number_of_blocks); // count nonzero elements in each block into counts_in_blocks, // the counts_in_blocks buffer is pre-allocated on gpu first. template -cudaError_t NonZeroCountEachBlock(const InputT* x, int64_t x_size, int* counts_in_blocks); +cudaError_t NonZeroCountEachBlock(cudaStream_t stream, const InputT* x, int64_t x_size, int* counts_in_blocks); // output nonzero positions using input x and prefix_counts for each blocks template cudaError_t NonZeroOutputPositions( - const InputT *x, int64_t x_size, int x_rank, const TArray& x_strides, + cudaStream_t stream, const InputT *x, int64_t x_size, int x_rank, const TArray& x_strides, const int* prefix_counts, int nonzero_elements, int64_t* results); } // namespace cuda diff --git a/onnxruntime/core/providers/cuda/tensor/nonzero_op.cc b/onnxruntime/core/providers/cuda/tensor/nonzero_op.cc index 67cab1e5df..992fe5dfab 100644 --- a/onnxruntime/core/providers/cuda/tensor/nonzero_op.cc +++ b/onnxruntime/core/providers/cuda/tensor/nonzero_op.cc @@ -66,17 +66,18 @@ Status NonZero::ComputeInternal(OpKernelContext* context) const { const int number_of_blocks = NonZeroCalcBlockCount(x_size); auto prefix_buffer = GetScratchBuffer(number_of_blocks); int* prefix_counts = prefix_buffer.get(); - CUDA_RETURN_IF_ERROR(NonZeroCountEachBlock(x_data, x_size, prefix_counts)); + CUDA_RETURN_IF_ERROR(NonZeroCountEachBlock(Stream(), x_data, x_size, prefix_counts)); size_t temp_storage_bytes = 0; - CUDA_RETURN_IF_ERROR(NonZeroCalcPrefixSumTempStorageBytes(prefix_counts, number_of_blocks, temp_storage_bytes)); + CUDA_RETURN_IF_ERROR(NonZeroCalcPrefixSumTempStorageBytes(Stream(), prefix_counts, number_of_blocks, temp_storage_bytes)); auto temp_buffer = GetScratchBuffer(temp_storage_bytes); auto d_temp_storage = temp_buffer.get(); - CUDA_RETURN_IF_ERROR(NonZeroInclusivePrefixSum(d_temp_storage, temp_storage_bytes, prefix_counts, number_of_blocks)); + CUDA_RETURN_IF_ERROR(NonZeroInclusivePrefixSum(Stream(), d_temp_storage, temp_storage_bytes, prefix_counts, number_of_blocks)); - CUDA_RETURN_IF_ERROR(cudaMemcpy( + // cudaMemcpyAsync from device memory to pageable host memory will return only once the copy has completed. 
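(The comment above relies on documented CUDA behavior: an async device-to-host copy into pageable host memory returns only after the copy has finished. A minimal standalone sketch of that behavior, not part of this patch; the helper name is illustrative only.)

    #include <cuda_runtime.h>

    // Reads a single int that lives on the device. Because the destination is an
    // ordinary (pageable) stack variable, cudaMemcpyAsync is synchronous with respect
    // to the host here, so the value is usable as soon as the call returns and no
    // explicit cudaStreamSynchronize is needed.
    int read_device_int(const int* device_value, cudaStream_t stream) {
      int host_value = 0;  // pageable host memory
      (void)cudaMemcpyAsync(&host_value, device_value, sizeof(int),
                            cudaMemcpyDeviceToHost, stream);
      return host_value;
    }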
+ CUDA_RETURN_IF_ERROR(cudaMemcpyAsync( &nonzero_elements, prefix_counts + number_of_blocks - 1, - sizeof(int), cudaMemcpyDeviceToHost)); + sizeof(int), cudaMemcpyDeviceToHost, Stream())); TArray fdm_x_strides(x_rank); TensorPitches x_strides(x_dims); @@ -87,7 +88,7 @@ Status NonZero::ComputeInternal(OpKernelContext* context) const { auto* output_tensor = context->Output(0, {x_rank, nonzero_elements}); ORT_ENFORCE(output_tensor, "failed to get first output!"); CUDA_RETURN_IF_ERROR(NonZeroOutputPositions( - x_data, x_size, x_rank, fdm_x_strides, + Stream(), x_data, x_size, x_rank, fdm_x_strides, prefix_counts, nonzero_elements, output_tensor->template MutableData())); } else { context->Output(0, {x_rank, nonzero_elements}); diff --git a/onnxruntime/core/providers/cuda/tensor/onehot.cc b/onnxruntime/core/providers/cuda/tensor/onehot.cc index de68c6b752..7847a2309b 100644 --- a/onnxruntime/core/providers/cuda/tensor/onehot.cc +++ b/onnxruntime/core/providers/cuda/tensor/onehot.cc @@ -66,8 +66,9 @@ Status OneHotOp::ComputeInternal(OpKernelContext* auto* output_data = reinterpret_cast(output->MutableData()); if (values_data[0] == CudaT_Out(0.f)) { - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(output->MutableDataRaw(), 0, output->SizeInBytes())); - OneHotWithZeroOffValueImpl(indices_data, + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(output->MutableDataRaw(), 0, output->SizeInBytes(), Stream())); + OneHotWithZeroOffValueImpl(Stream(), + indices_data, fdm_suffix, depth_val, values_data[1], @@ -77,7 +78,8 @@ Status OneHotOp::ComputeInternal(OpKernelContext* } const fast_divmod fdm_depth_suffix(gsl::narrow_cast(depth_val * suffix_dim_size)); - OneHotImpl(indices_data, fdm_depth_suffix, fdm_suffix, depth_val, + OneHotImpl(Stream(), + indices_data, fdm_depth_suffix, fdm_suffix, depth_val, values_data[1], values_data[0], output_data, diff --git a/onnxruntime/core/providers/cuda/tensor/onehot.cu b/onnxruntime/core/providers/cuda/tensor/onehot.cu index 88cf5576dc..1fb8dbe8b8 100644 --- a/onnxruntime/core/providers/cuda/tensor/onehot.cu +++ b/onnxruntime/core/providers/cuda/tensor/onehot.cu @@ -56,6 +56,7 @@ __global__ void _OneHotWithZeroOffValueImpl( template void OneHotImpl( + cudaStream_t stream, const in_type* indices_data, const fast_divmod fdm_depth_suffix, const fast_divmod fdm_suffix, @@ -66,7 +67,7 @@ void OneHotImpl( size_t count) { int blocksPerGrid = (int)(ceil(static_cast(count) / GridDim::maxThreadsPerBlock)); CUDA_LONG N = static_cast(count); - _OneHotImpl<<>>( + _OneHotImpl<<>>( indices_data, fdm_depth_suffix, fdm_suffix, @@ -79,6 +80,7 @@ void OneHotImpl( template void OneHotWithZeroOffValueImpl( + cudaStream_t stream, const in_type* indices_data, const fast_divmod fdm_suffix, const int64_t depth_val, @@ -87,7 +89,7 @@ void OneHotWithZeroOffValueImpl( size_t count) { int blocksPerGrid = (int)(ceil(static_cast(count) / GridDim::maxThreadsPerBlock)); CUDA_LONG N = static_cast(count); - _OneHotWithZeroOffValueImpl<<>>( + _OneHotWithZeroOffValueImpl<<>>( indices_data, fdm_suffix, depth_val, @@ -98,6 +100,7 @@ void OneHotWithZeroOffValueImpl( #define SPECIALIZED_OneHotImpl(in_type, out_type) \ template void OneHotImpl( \ + cudaStream_t stream, \ const in_type* indices_data, \ const fast_divmod fdm_depth_suffix, \ const fast_divmod fdm_suffix, \ @@ -115,6 +118,7 @@ SPECIALIZED_OneHotImpl(int32_t, half) #define SPECIALIZED_OneHotWithZeroOffValueImpl(in_type, out_type) \ template void OneHotWithZeroOffValueImpl( \ + cudaStream_t stream, \ const in_type* indices_data, \ const fast_divmod fdm_suffix, \ const 
int64_t depth_val, \ diff --git a/onnxruntime/core/providers/cuda/tensor/onehot.h b/onnxruntime/core/providers/cuda/tensor/onehot.h index 55d7a961e0..fff0acd1f1 100644 --- a/onnxruntime/core/providers/cuda/tensor/onehot.h +++ b/onnxruntime/core/providers/cuda/tensor/onehot.h @@ -11,6 +11,7 @@ namespace cuda { template void OneHotImpl( + cudaStream_t stream, const in_type* indices, const fast_divmod fdm_depth_suffix, const fast_divmod fdm_suffix, @@ -22,6 +23,7 @@ void OneHotImpl( template void OneHotWithZeroOffValueImpl( + cudaStream_t stream, const in_type* indices, const fast_divmod fdm_suffix, const int64_t depth_val, diff --git a/onnxruntime/core/providers/cuda/tensor/pad.cc b/onnxruntime/core/providers/cuda/tensor/pad.cc index e870306c31..2e344ebb7e 100644 --- a/onnxruntime/core/providers/cuda/tensor/pad.cc +++ b/onnxruntime/core/providers/cuda/tensor/pad.cc @@ -123,7 +123,7 @@ Status Pad::ComputeInternal(OpKernelContext* ctx) const { CUDA_RETURN_IF_ERROR(cudaMemcpyAsync( output_tensor.template MutableData(), input_tensor.template Data(), sizeof(typename ToCudaType::MappedType) * output_shape.Size(), - cudaMemcpyDeviceToDevice, 0)); + cudaMemcpyDeviceToDevice, Stream())); return Status::OK(); } @@ -134,6 +134,7 @@ Status Pad::ComputeInternal(OpKernelContext* ctx) const { } PadImpl( + Stream(), dimension_count, input_dims, input_strides, diff --git a/onnxruntime/core/providers/cuda/tensor/pad_impl.cu b/onnxruntime/core/providers/cuda/tensor/pad_impl.cu index 400189b535..2e1820f198 100644 --- a/onnxruntime/core/providers/cuda/tensor/pad_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/pad_impl.cu @@ -69,6 +69,7 @@ __global__ void _PadKernel( template void PadImpl( + cudaStream_t stream, const size_t shape_rank, const TArray& input_dims, const TArray& input_strides, @@ -86,17 +87,17 @@ void PadImpl( int blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); switch (pad_mode) { case 0: - _PadKernel<<>>( + _PadKernel<<>>( shape_rank, input_dims, input_strides, lower_pads, upper_pads, pad_value, input_data, fdm_output_strides, output_data, N); break; case 1: - _PadKernel<<>>( + _PadKernel<<>>( shape_rank, input_dims, input_strides, lower_pads, upper_pads, pad_value, input_data, fdm_output_strides, output_data, N); break; case 2: - _PadKernel<<>>( + _PadKernel<<>>( shape_rank, input_dims, input_strides, lower_pads, upper_pads, pad_value, input_data, fdm_output_strides, output_data, N); break; @@ -104,7 +105,7 @@ void PadImpl( } #define SPECIALIZED_IMPL(T) \ - template void PadImpl(const size_t shape_rank, const TArray& input_dims, const TArray& input_strides, const TArray& lower_pads, const TArray& upper_pads, const T pad_value, const int pad_mode, const T* input_data, const TArray& fdm_output_strides, T* output_data, const size_t N); + template void PadImpl(cudaStream_t stream, const size_t shape_rank, const TArray& input_dims, const TArray& input_strides, const TArray& lower_pads, const TArray& upper_pads, const T pad_value, const int pad_mode, const T* input_data, const TArray& fdm_output_strides, T* output_data, const size_t N); SPECIALIZED_IMPL(float) SPECIALIZED_IMPL(double) diff --git a/onnxruntime/core/providers/cuda/tensor/pad_impl.h b/onnxruntime/core/providers/cuda/tensor/pad_impl.h index 68365512d8..8be69dcb1f 100644 --- a/onnxruntime/core/providers/cuda/tensor/pad_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/pad_impl.h @@ -10,6 +10,7 @@ namespace cuda { template void PadImpl( + cudaStream_t stream, const size_t shape_rank, const TArray& input_dims, 
const TArray& input_strides, diff --git a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc index 3f321fadb0..17fafa0af5 100644 --- a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc +++ b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc @@ -32,7 +32,7 @@ Status QuantizeLinear::ComputeInternal(OpKernelContext* ctx) const { const CudaU* scale = reinterpret_cast(y_scale.template Data()); const auto num_of_elements = x_shape.Size(); - CudaQuantizeLinear(input, output, scale, zero_point, num_of_elements); + CudaQuantizeLinear(Stream(), input, output, scale, zero_point, num_of_elements); return Status::OK(); } @@ -59,7 +59,7 @@ Status DequantizeLinear::ComputeInternal(OpKernelContext* ctx) const { const CudaU* scale = reinterpret_cast(y_scale.template Data()); const auto num_of_elements = x_shape.Size(); - CudaDequantizeLinear(input, output, scale, zero_point, num_of_elements); + CudaDequantizeLinear(Stream(), input, output, scale, zero_point, num_of_elements); return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cu b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cu index 8f31ea9e01..ff300e4bda 100644 --- a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cu +++ b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cu @@ -44,12 +44,12 @@ __global__ void QuantizeLinearKernel(const InT* input, OutT* output, const InT* } template -Status CudaQuantizeLinear(const InT* input, OutT* output, const InT* scale, const OutT* zero_point, size_t num_of_element) { +Status CudaQuantizeLinear(cudaStream_t stream, const InT* input, OutT* output, const InT* scale, const OutT* zero_point, size_t num_of_element) { if (num_of_element <= 0) return Status::OK(); int blocksPerGrid = static_cast(CeilDiv(num_of_element, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); - QuantizeLinearKernel<<>>( + QuantizeLinearKernel<<>>( input, output, scale, @@ -75,12 +75,12 @@ __global__ void DequantizeLinearKernel(const InT* input, OutT* output, const Out } template -Status CudaDequantizeLinear(const InT* input, OutT* output, const OutT* scale, const InT* zero_point, size_t num_of_element) { +Status CudaDequantizeLinear(cudaStream_t stream, const InT* input, OutT* output, const OutT* scale, const InT* zero_point, size_t num_of_element) { if (num_of_element <= 0) return Status::OK(); int blocksPerGrid = static_cast(CeilDiv(num_of_element, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); - DequantizeLinearKernel<<>>( + DequantizeLinearKernel<<>>( input, output, scale, @@ -89,15 +89,15 @@ Status CudaDequantizeLinear(const InT* input, OutT* output, const OutT* scale, c return Status::OK(); } -template Status CudaQuantizeLinear(const float* input, int8_t* output, const float* scale, const int8_t* zero_point, size_t num_of_element); -template Status CudaQuantizeLinear(const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, size_t num_of_element); -template Status CudaQuantizeLinear(const half* input, int8_t* output, const half* scale, const int8_t* zero_point, size_t num_of_element); -template Status CudaQuantizeLinear(const half* input, uint8_t* output, const half* scale, const uint8_t* zero_point, size_t num_of_element); +template Status CudaQuantizeLinear(cudaStream_t stream, const float* input, int8_t* output, const float* scale, const int8_t* zero_point, size_t num_of_element); +template Status CudaQuantizeLinear(cudaStream_t stream, 
const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, size_t num_of_element); +template Status CudaQuantizeLinear(cudaStream_t stream, const half* input, int8_t* output, const half* scale, const int8_t* zero_point, size_t num_of_element); +template Status CudaQuantizeLinear(cudaStream_t stream, const half* input, uint8_t* output, const half* scale, const uint8_t* zero_point, size_t num_of_element); -template Status CudaDequantizeLinear(const int8_t* input, float* output, const float* scale, const int8_t* zero_point, size_t num_of_element); -template Status CudaDequantizeLinear(const uint8_t* input, float* output, const float* scale, const uint8_t* zero_point, size_t num_of_element); -template Status CudaDequantizeLinear(const int8_t* input, half* output, const half* scale, const int8_t* zero_point, size_t num_of_element); -template Status CudaDequantizeLinear(const uint8_t* input, half* output, const half* scale, const uint8_t* zero_point, size_t num_of_element); +template Status CudaDequantizeLinear(cudaStream_t stream, const int8_t* input, float* output, const float* scale, const int8_t* zero_point, size_t num_of_element); +template Status CudaDequantizeLinear(cudaStream_t stream, const uint8_t* input, float* output, const float* scale, const uint8_t* zero_point, size_t num_of_element); +template Status CudaDequantizeLinear(cudaStream_t stream, const int8_t* input, half* output, const half* scale, const int8_t* zero_point, size_t num_of_element); +template Status CudaDequantizeLinear(cudaStream_t stream, const uint8_t* input, half* output, const half* scale, const uint8_t* zero_point, size_t num_of_element); } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cuh b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cuh index 5d140981d6..b6773de316 100644 --- a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cuh +++ b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cuh @@ -12,10 +12,10 @@ namespace onnxruntime { namespace cuda { template -Status CudaQuantizeLinear(const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element); +Status CudaQuantizeLinear(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element); template -Status CudaDequantizeLinear(const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element); +Status CudaDequantizeLinear(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element); } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/resize_impl.cu b/onnxruntime/core/providers/cuda/tensor/resize_impl.cu index c93e8accd2..36d138f107 100644 --- a/onnxruntime/core/providers/cuda/tensor/resize_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/resize_impl.cu @@ -42,23 +42,29 @@ __device__ CudaFunctionNearestPixel func_NearestPixel_ROUND_PREFER_CEIL = Neares __device__ CudaFunctionNearestPixel func_NearestPixel_FLOOR = NearestPixel_FLOOR; __device__ CudaFunctionNearestPixel func_NearestPixel_CEIL = NearestPixel_CEIL; -CudaFunctionNearestPixel GetDeviceNearstPixelFunction(ResizeNearestMode nearest_mode) { +CudaFunctionNearestPixel GetDeviceNearstPixelFunction(cudaStream_t stream, ResizeNearestMode nearest_mode) { static bool already_copied = false; static std::mutex s_mutext; static CudaFunctionNearestPixel s_nearest_pixel[ResizeNearestMode::NearestModeCount]; if 
(!already_copied) { std::lock_guard lock(s_mutext); if (!already_copied) { - CUDA_CALL(cudaMemcpyFromSymbol(&s_nearest_pixel[ResizeNearestMode::SIMPLE], - func_NearestPixel_SIMPLE, sizeof(CudaFunctionNearestPixel))); - CUDA_CALL(cudaMemcpyFromSymbol(&s_nearest_pixel[ResizeNearestMode::ROUND_PREFER_FLOOR], - func_NearestPixel_ROUND_PREFER_FLOOR, sizeof(CudaFunctionNearestPixel))); - CUDA_CALL(cudaMemcpyFromSymbol(&s_nearest_pixel[ResizeNearestMode::ROUND_PREFER_CEIL], - func_NearestPixel_ROUND_PREFER_CEIL, sizeof(CudaFunctionNearestPixel))); - CUDA_CALL(cudaMemcpyFromSymbol(&s_nearest_pixel[ResizeNearestMode::FLOOR], - func_NearestPixel_FLOOR, sizeof(CudaFunctionNearestPixel))); - CUDA_CALL(cudaMemcpyFromSymbol(&s_nearest_pixel[ResizeNearestMode::CEIL], - func_NearestPixel_CEIL, sizeof(CudaFunctionNearestPixel))); + CUDA_CALL(cudaMemcpyFromSymbolAsync(&s_nearest_pixel[ResizeNearestMode::SIMPLE], + func_NearestPixel_SIMPLE, sizeof(CudaFunctionNearestPixel), + 0, cudaMemcpyDeviceToHost, stream)); + CUDA_CALL(cudaMemcpyFromSymbolAsync(&s_nearest_pixel[ResizeNearestMode::ROUND_PREFER_FLOOR], + func_NearestPixel_ROUND_PREFER_FLOOR, sizeof(CudaFunctionNearestPixel), + 0, cudaMemcpyDeviceToHost, stream)); + CUDA_CALL(cudaMemcpyFromSymbolAsync(&s_nearest_pixel[ResizeNearestMode::ROUND_PREFER_CEIL], + func_NearestPixel_ROUND_PREFER_CEIL, sizeof(CudaFunctionNearestPixel), + 0, cudaMemcpyDeviceToHost, stream)); + CUDA_CALL(cudaMemcpyFromSymbolAsync(&s_nearest_pixel[ResizeNearestMode::FLOOR], + func_NearestPixel_FLOOR, sizeof(CudaFunctionNearestPixel), + 0, cudaMemcpyDeviceToHost, stream)); + CUDA_CALL(cudaMemcpyFromSymbolAsync(&s_nearest_pixel[ResizeNearestMode::CEIL], + func_NearestPixel_CEIL, sizeof(CudaFunctionNearestPixel), + 0, cudaMemcpyDeviceToHost, stream)); + CUDA_CALL(cudaStreamSynchronize(stream)); already_copied = true; } } @@ -105,25 +111,32 @@ __device__ CudaFunctionOriginalCoordinate func_TransformCoordinate_ALIGN_CORNERS __device__ CudaFunctionOriginalCoordinate func_TransformCoordinate_TF_HALF_PIXEL_FOR_NN = TransformCoordinate_TF_HALF_PIXEL_FOR_NN; __device__ CudaFunctionOriginalCoordinate func_TransformCoordinate_TF_CROP_AND_RESIZE = TransformCoordinate_TF_CROP_AND_RESIZE; -CudaFunctionOriginalCoordinate GetDeviceOriginalCoordinateFunc(ResizeCoordinateTransformationMode coordinate_transform_mode) { +CudaFunctionOriginalCoordinate GetDeviceOriginalCoordinateFunc(cudaStream_t stream, ResizeCoordinateTransformationMode coordinate_transform_mode) { static bool already_copied = false; static std::mutex s_mutext; static CudaFunctionOriginalCoordinate s_coordinate_tranforms[ResizeCoordinateTransformationMode::CoordinateTransformationModeCount]; if (!already_copied) { std::lock_guard lock(s_mutext); if (!already_copied) { - CUDA_CALL(cudaMemcpyFromSymbol(&s_coordinate_tranforms[ResizeCoordinateTransformationMode::HALF_PIXEL], - func_TransformCoordinate_HALF_PIXEL, sizeof(CudaFunctionOriginalCoordinate))); - CUDA_CALL(cudaMemcpyFromSymbol(&s_coordinate_tranforms[ResizeCoordinateTransformationMode::ASYMMETRIC], - func_TransformCoordinate_ASYMMETRIC, sizeof(CudaFunctionOriginalCoordinate))); - CUDA_CALL(cudaMemcpyFromSymbol(&s_coordinate_tranforms[ResizeCoordinateTransformationMode::PYTORCH_HALF_PIXEL], - func_TransformCoordinate_PYTORCH_HALF_PIXEL, sizeof(CudaFunctionOriginalCoordinate))); - CUDA_CALL(cudaMemcpyFromSymbol(&s_coordinate_tranforms[ResizeCoordinateTransformationMode::ALIGN_CORNERS], - func_TransformCoordinate_ALIGN_CORNERS, sizeof(CudaFunctionOriginalCoordinate))); - 
CUDA_CALL(cudaMemcpyFromSymbol(&s_coordinate_tranforms[ResizeCoordinateTransformationMode::TF_HALF_PIXEL_FOR_NN], - func_TransformCoordinate_TF_HALF_PIXEL_FOR_NN, sizeof(CudaFunctionOriginalCoordinate))); - CUDA_CALL(cudaMemcpyFromSymbol(&s_coordinate_tranforms[ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE], - func_TransformCoordinate_TF_CROP_AND_RESIZE, sizeof(CudaFunctionOriginalCoordinate))); + CUDA_CALL(cudaMemcpyFromSymbolAsync(&s_coordinate_tranforms[ResizeCoordinateTransformationMode::HALF_PIXEL], + func_TransformCoordinate_HALF_PIXEL, sizeof(CudaFunctionOriginalCoordinate), + 0, cudaMemcpyDeviceToHost, stream)); + CUDA_CALL(cudaMemcpyFromSymbolAsync(&s_coordinate_tranforms[ResizeCoordinateTransformationMode::ASYMMETRIC], + func_TransformCoordinate_ASYMMETRIC, sizeof(CudaFunctionOriginalCoordinate), + 0, cudaMemcpyDeviceToHost, stream)); + CUDA_CALL(cudaMemcpyFromSymbolAsync(&s_coordinate_tranforms[ResizeCoordinateTransformationMode::PYTORCH_HALF_PIXEL], + func_TransformCoordinate_PYTORCH_HALF_PIXEL, sizeof(CudaFunctionOriginalCoordinate), + 0, cudaMemcpyDeviceToHost, stream)); + CUDA_CALL(cudaMemcpyFromSymbolAsync(&s_coordinate_tranforms[ResizeCoordinateTransformationMode::ALIGN_CORNERS], + func_TransformCoordinate_ALIGN_CORNERS, sizeof(CudaFunctionOriginalCoordinate), + 0, cudaMemcpyDeviceToHost, stream)); + CUDA_CALL(cudaMemcpyFromSymbolAsync(&s_coordinate_tranforms[ResizeCoordinateTransformationMode::TF_HALF_PIXEL_FOR_NN], + func_TransformCoordinate_TF_HALF_PIXEL_FOR_NN, sizeof(CudaFunctionOriginalCoordinate), + 0, cudaMemcpyDeviceToHost, stream)); + CUDA_CALL(cudaMemcpyFromSymbolAsync(&s_coordinate_tranforms[ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE], + func_TransformCoordinate_TF_CROP_AND_RESIZE, sizeof(CudaFunctionOriginalCoordinate), + 0, cudaMemcpyDeviceToHost, stream)); + CUDA_CALL(cudaStreamSynchronize(stream)); already_copied = true; } } @@ -591,6 +604,7 @@ size_t CalcResizeBufferSize(const onnxruntime::UpsampleMode upsample_mode, template void ResizeNearestImpl( + cudaStream_t stream, const int rank, TArray& input_shape, TArray& output_shape, @@ -611,7 +625,7 @@ void ResizeNearestImpl( unsigned int blocksPerGrid = static_cast(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); bool could2d = rank >= 2 && - transform_coordinate != GetDeviceOriginalCoordinateFunc(ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE) && + transform_coordinate != GetDeviceOriginalCoordinateFunc(stream, ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE) && std::all_of(scales_vals.Data(), scales_vals.Data() + (rank - 2), [](float v) { return v == 1.0; }); if (could2d) { int64_t output_height = output_shape[rank - 2]; @@ -619,7 +633,7 @@ void ResizeNearestImpl( fast_divmod div_output_image = (rank > 2) ? 
output_div_pitches[rank - 3] : fast_divmod(static_cast(output_height * output_width)); int blocksPerDimsMappingGrid = static_cast(ceil((output_height + output_width) / 32.0)); - _ResizeNearestMappingKernel2D<<>>( + _ResizeNearestMappingKernel2D<<>>( static_cast(input_shape[rank - 2]), static_cast(input_shape[rank - 1]), static_cast(output_height), static_cast(output_width), scales_vals[rank - 2], scales_vals[rank - 1], @@ -628,7 +642,7 @@ void ResizeNearestImpl( extrapolation_enabled, transform_coordinate, calc_nearest_pixel, dims_mapping); if (extrapolation_enabled) { - _ResizeNearestKernel2D<<>>( + _ResizeNearestKernel2D<<>>( output_height, output_width, input_shape[rank - 2] * input_shape[rank - 1], static_cast(input_shape[rank - 1]), div_output_image, output_div_pitches[rank - 2], @@ -636,7 +650,7 @@ void ResizeNearestImpl( extrapolation_value, dims_mapping); } else { - _ResizeNearestKernel2D<<>>( + _ResizeNearestKernel2D<<>>( output_height, output_width, input_shape[rank - 2] * input_shape[rank - 1], static_cast(input_shape[rank - 1]), div_output_image, output_div_pitches[rank - 2], @@ -649,14 +663,14 @@ void ResizeNearestImpl( int64_t total_dim_sum = std::accumulate(output_shape.Data(), output_shape.Data() + rank, (int64_t)0); int blocksPerDimsMappingGrid = (int)(ceil(static_cast(total_dim_sum) / 32)); - _ResizeNearestMappingKernel<<>>( + _ResizeNearestMappingKernel<<>>( rank, input_shape, output_shape, scales_vals, roi_vals, total_dim_sum, extrapolation_enabled, transform_coordinate, calc_nearest_pixel, reinterpret_cast(dims_mapping), reinterpret_cast(reinterpret_cast(dims_mapping) + rank)); - _ResizeNearestKernel<<>>( + _ResizeNearestKernel<<>>( rank, input_strides, output_div_pitches, input_data, output_data, N, extrapolation_value, @@ -667,6 +681,7 @@ void ResizeNearestImpl( template void ResizeImpl( + cudaStream_t stream, const UpsampleMode upsample_mode, const int rank, TArray& input_shape, @@ -688,15 +703,15 @@ void ResizeImpl( bool isSame = std::all_of(scales_vals.Data(), scales_vals.Data() + rank, [](float v) { return v == 1.0f; }) && (coordinate_transform_mode != ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE); if (isSame) { - cudaMemcpyAsync(output_data, input_data, N * sizeof(T), cudaMemcpyDeviceToDevice); + cudaMemcpyAsync(output_data, input_data, N * sizeof(T), cudaMemcpyDeviceToDevice, stream); return; } - CudaFunctionOriginalCoordinate transform_coordinate = GetDeviceOriginalCoordinateFunc(coordinate_transform_mode); - CudaFunctionNearestPixel calc_nearest_pixel = GetDeviceNearstPixelFunction(nearest_mode); + CudaFunctionOriginalCoordinate transform_coordinate = GetDeviceOriginalCoordinateFunc(stream, coordinate_transform_mode); + CudaFunctionNearestPixel calc_nearest_pixel = GetDeviceNearstPixelFunction(stream, nearest_mode); if (upsample_mode == UpsampleMode::NN) { ResizeNearestImpl( - rank, input_shape, output_shape, input_strides, output_div_pitches, + stream, rank, input_shape, output_shape, input_strides, output_div_pitches, scales_vals, roi_vals, input_data, output_data, N, extrapolation_enabled, extrapolation_value, cubic_coeff_a, transform_coordinate, calc_nearest_pixel, @@ -734,7 +749,7 @@ void ResizeImpl( switch (upsample_mode) { case UpsampleMode::LINEAR: if (is_2D) { - _ResizeBilinearCoordinateMapping<<>>( + _ResizeBilinearCoordinateMapping<<>>( input_shape[rank - 2], input_shape[rank - 1], output_height, output_width, scales_vals[rank - 2], scales_vals[rank - 1], @@ -742,7 +757,7 @@ void ResizeImpl( roi_vals[rank - 1], roi_vals[rank - 1 + rank], 
output_height + output_width, extrapolation_enabled, transform_coordinate, reinterpret_cast(dims_mapping)); - _ResizeBilinearKernel<<>>( + _ResizeBilinearKernel<<>>( input_shape[rank - 2], input_shape[rank - 1], output_height, output_width, output_div_pitches[rank - 2], div_output_image, @@ -750,7 +765,7 @@ void ResizeImpl( reinterpret_cast(dims_mapping)); return; } else if (is_3D) { - _ResizeTrilinearCoordinateMapping<<>>( + _ResizeTrilinearCoordinateMapping<<>>( input_shape[rank - 3] , input_shape[rank - 2], input_shape[rank - 1], output_depth, output_height, output_width, scales_vals[rank - 3], scales_vals[rank - 2], scales_vals[rank - 1], @@ -759,7 +774,7 @@ void ResizeImpl( roi_vals[rank - 1], roi_vals[rank - 1 + rank], output_depth + output_height + output_width, extrapolation_enabled, transform_coordinate, reinterpret_cast(dims_mapping)); - _ResizeTrilinearKernel<<>>( + _ResizeTrilinearKernel<<>>( input_shape[rank - 3], input_shape[rank - 2], input_shape[rank - 1], output_depth, output_height, output_width, output_div_pitches[rank - 3], output_div_pitches[rank - 2], div_output_image, @@ -772,7 +787,7 @@ void ResizeImpl( case UpsampleMode::CUBIC: if (is_2D) { - _ResizeCubicCoordinateMapping<<>>( + _ResizeCubicCoordinateMapping<<>>( input_shape[rank - 2], input_shape[rank - 1], output_height, output_width, scales_vals[rank - 2], scales_vals[rank - 1], @@ -781,7 +796,7 @@ void ResizeImpl( output_height + output_width, extrapolation_enabled, cubic_coeff_a, exclude_outside, transform_coordinate, reinterpret_cast(dims_mapping)); - _ResizeBiCubicKernel<<>>( + _ResizeBiCubicKernel<<>>( input_shape[rank - 2], input_shape[rank - 1], output_height, output_width, output_div_pitches[rank - 2], div_output_image, @@ -794,6 +809,7 @@ void ResizeImpl( #define SPECIALIZED_IMPL(T) \ template void ResizeImpl( \ + cudaStream_t stream, \ const UpsampleMode upsample_mode, \ const int rank, \ TArray& input_shape, \ diff --git a/onnxruntime/core/providers/cuda/tensor/resize_impl.h b/onnxruntime/core/providers/cuda/tensor/resize_impl.h index c82616d644..c2359c260c 100644 --- a/onnxruntime/core/providers/cuda/tensor/resize_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/resize_impl.h @@ -16,6 +16,7 @@ size_t CalcResizeBufferSize(const onnxruntime::UpsampleMode upsample_mode, template void ResizeImpl( + cudaStream_t stream, const onnxruntime::UpsampleMode upsample_mode, const int rank, TArray& input_shape, diff --git a/onnxruntime/core/providers/cuda/tensor/reverse_sequence.cc b/onnxruntime/core/providers/cuda/tensor/reverse_sequence.cc index f51d99c549..7f70c57503 100644 --- a/onnxruntime/core/providers/cuda/tensor/reverse_sequence.cc +++ b/onnxruntime/core/providers/cuda/tensor/reverse_sequence.cc @@ -20,8 +20,9 @@ ONNX_OPERATOR_KERNEL_EX( ReverseSequenceOp); #define ReverseSequenceCallCudaImplTypeAs(T, TEqual) \ - if (X.IsDataType()) { \ + if (X.IsDataType()) { \ CUDA_RETURN_IF_ERROR(ReverseSequenceCudaImpl( \ + Stream(), \ reinterpret_cast::MappedType*>(X.template Data()), \ seq_lengths.Data(), \ reinterpret_cast::MappedType*>(Y.template MutableData()), \ diff --git a/onnxruntime/core/providers/cuda/tensor/reverse_sequence_impl.cu b/onnxruntime/core/providers/cuda/tensor/reverse_sequence_impl.cu index b7de4c3323..4d37b6a206 100644 --- a/onnxruntime/core/providers/cuda/tensor/reverse_sequence_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/reverse_sequence_impl.cu @@ -52,6 +52,7 @@ __global__ void ReverseSequenceImplKernel( template cudaError_t ReverseSequenceCudaImpl( + cudaStream_t stream, const 
T* x_data, const int64_t* seq_len_data, T* y_data, @@ -66,11 +67,11 @@ cudaError_t ReverseSequenceCudaImpl( int blocksPerGrid = CeilDiv(group_count, GridDim::maxThreadsPerBlock); if (time_major) { - ReverseSequenceImplKernel<<>>( + ReverseSequenceImplKernel<<>>( x_data, seq_len_data, y_data, batch_size, max_seq_len, element_size, group_count, fdm_grouped_stride_0, fdm_grouped_stride_1); } else { - ReverseSequenceImplKernel<<>>( + ReverseSequenceImplKernel<<>>( x_data, seq_len_data, y_data, batch_size, max_seq_len, element_size, group_count, fdm_grouped_stride_0, fdm_grouped_stride_1); } @@ -79,6 +80,7 @@ cudaError_t ReverseSequenceCudaImpl( #define InstantiateReverseSequenceImpl(T) \ template cudaError_t ReverseSequenceCudaImpl( \ + cudaStream_t stream, \ const T* x_data, \ const int64_t* seq_len_data, \ T* y_data, \ diff --git a/onnxruntime/core/providers/cuda/tensor/reverse_sequence_impl.h b/onnxruntime/core/providers/cuda/tensor/reverse_sequence_impl.h index bc0973b50d..15268be59e 100644 --- a/onnxruntime/core/providers/cuda/tensor/reverse_sequence_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/reverse_sequence_impl.h @@ -11,6 +11,7 @@ namespace cuda { template cudaError_t ReverseSequenceCudaImpl( + cudaStream_t stream, const T* x_data, const int64_t* seq_len_data, T* y_data, diff --git a/onnxruntime/core/providers/cuda/tensor/scatter_elements.cc b/onnxruntime/core/providers/cuda/tensor/scatter_elements.cc index 5a31a472aa..ab04fb150a 100755 --- a/onnxruntime/core/providers/cuda/tensor/scatter_elements.cc +++ b/onnxruntime/core/providers/cuda/tensor/scatter_elements.cc @@ -49,7 +49,8 @@ ONNX_OPERATOR_KERNEL_EX( template struct ScatterElements::ComputeImpl { - Status operator()(const Tensor* data_tensor, + Status operator()(cudaStream_t stream, + const Tensor* data_tensor, const Tensor* updates_tensor, const Tensor* indices_tensor, Tensor* output_tensor, @@ -69,6 +70,7 @@ struct ScatterElements::ComputeImpl { if (utils::IsPrimitiveDataType(Tin_type)) { const int32_t* indices_data = indices_tensor->template Data(); return ScatterElementsImpl( + stream, rank, reinterpret_cast(input_data), input_data_size, @@ -84,6 +86,7 @@ struct ScatterElements::ComputeImpl { } else if (utils::IsPrimitiveDataType(Tin_type)) { const int64_t* indices_data = indices_tensor->template Data(); return ScatterElementsImpl( + stream, rank, reinterpret_cast(input_data), input_data_size, @@ -163,7 +166,7 @@ Status ScatterElements::ComputeInternal(OpKernelContext* context) const { utils::MLTypeCallDispatcherRet t_disp(data_tensor->GetElementType()); - return t_disp.Invoke(data_tensor, updates_tensor, indices_tensor, output_tensor, rank, + return t_disp.Invoke(Stream(), data_tensor, updates_tensor, indices_tensor, output_tensor, rank, input_data_size, buffer_input_dims, buffer_input_strides, indices_size, buffer_indices_dims, fdm_indices_strides, axis); } diff --git a/onnxruntime/core/providers/cuda/tensor/scatter_elements_impl.cu b/onnxruntime/core/providers/cuda/tensor/scatter_elements_impl.cu index 03e4d6afbf..c4536ba112 100755 --- a/onnxruntime/core/providers/cuda/tensor/scatter_elements_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/scatter_elements_impl.cu @@ -145,6 +145,7 @@ static int CompactInputIndicesDims( template Status ScatterElementsImpl2D( + cudaStream_t stream, const T* input_data, const std::vector& input_dims, const Tin* indices_data, @@ -157,12 +158,12 @@ Status ScatterElementsImpl2D( int blocksPerGrid = gsl::narrow_cast(CeilDiv(indices_size, GridDim::maxThreadsPerBlock)); fast_divmod 
indices_stride_row(static_cast(indices_dims[1])); if (axis == 0) { - _ScatterElementsKernel2D<<>>( + _ScatterElementsKernel2D<<>>( gsl::narrow_cast(input_dims[0]), input_data, indices_data, indices_size, indices_stride_row, updates, input_dims[1], output_data, func); } else { - _ScatterElementsKernel2D<<>>( + _ScatterElementsKernel2D<<>>( gsl::narrow_cast(input_dims[1]), input_data, indices_data, indices_size, indices_stride_row, updates, input_dims[1], output_data, func); @@ -172,6 +173,7 @@ Status ScatterElementsImpl2D( template Status ScatterElementsImplInternal( + cudaStream_t stream, const int rank, const T* input_data, const int64_t input_size, @@ -186,7 +188,7 @@ Status ScatterElementsImplInternal( T* output_data, const FuncT& func) { if (input_data != output_data) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_data, input_data, input_size * sizeof(T), cudaMemcpyDeviceToDevice, 0)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_data, input_data, input_size * sizeof(T), cudaMemcpyDeviceToDevice, stream)); } if (indices_size > 0) { @@ -196,12 +198,12 @@ Status ScatterElementsImplInternal( rank, axis, buffer_input_dims.Data(), buffer_indices_dims.Data(), eff_input_dims, eff_indices_dims); if (eff_input_dims.size() == 2) { return ScatterElementsImpl2D( - input_data, eff_input_dims, indices_data, indices_size, eff_indices_dims, updates, new_axis, output_data, + stream, input_data, eff_input_dims, indices_data, indices_size, eff_indices_dims, updates, new_axis, output_data, func); } int blocksPerGrid = gsl::narrow_cast(CeilDiv(indices_size, GridDim::maxThreadsPerBlock)); - _ScatterElementsKernel<<>>( + _ScatterElementsKernel<<>>( rank, input_data, buffer_input_dims, buffer_input_strides, indices_data, indices_size, buffer_indices_dims, fdm_indices_strides, updates, axis, output_data, func); @@ -218,6 +220,7 @@ struct Func_Assignment { template Status ScatterElementsImpl( + cudaStream_t stream, const int rank, const T* input_data, const int64_t input_size, @@ -230,13 +233,14 @@ Status ScatterElementsImpl( const T* updates, const int axis, T* output_data) { - return ScatterElementsImplInternal(rank, input_data, input_size, buffer_input_dims, + return ScatterElementsImplInternal(stream, rank, input_data, input_size, buffer_input_dims, buffer_input_strides, indices_data, indices_size, buffer_indices_dims, fdm_indices_strides, updates, axis, output_data, Func_Assignment()); } #define SCATTER_ELEMENTS_SPECIALIZED_TINDEX_IMPL(T, TIndex) \ template Status ScatterElementsImpl( \ + cudaStream_t stream, \ const int rank, \ const T* input_data, \ const int64_t input_size, \ @@ -278,6 +282,7 @@ struct Func_AtomicAdd { template Status GatherElementsGradImpl( + cudaStream_t stream, const int rank, TArray& buffer_input_dims, TArray& buffer_input_strides, @@ -290,7 +295,7 @@ Status GatherElementsGradImpl( T* output_data) { // Give output_data as the input_data parameter by intention, // to skip input_data copy, which is not applicable for GatherElementsGrad. 
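(The trick described in the comment above works because of the pointer-equality guard added earlier in ScatterElementsImplInternal. A condensed sketch of that pattern with simplified names, not the exact patch code.)

    #include <cuda_runtime.h>

    // When the caller passes the same pointer for input and output (as
    // GatherElementsGradImpl does by intention), the initial device-to-device
    // copy is skipped and the kernel scatters directly into the output buffer.
    template <typename T>
    void prepare_output(const T* input_data, T* output_data, size_t count, cudaStream_t stream) {
      if (input_data != output_data) {
        (void)cudaMemcpyAsync(output_data, input_data, count * sizeof(T),
                              cudaMemcpyDeviceToDevice, stream);
      }
    }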
- return ScatterElementsImplInternal(rank, output_data, 0, + return ScatterElementsImplInternal(stream, rank, output_data, 0, buffer_input_dims, buffer_input_strides, indices_data, indices_size, buffer_indices_dims, fdm_indices_strides, updates, axis, output_data, Func_AtomicAdd()); @@ -298,6 +303,7 @@ Status GatherElementsGradImpl( #define GATHER_ELEMENTS_GRAD_SPECIALIZED_TINDEX_IMPL(T, TIndex) \ template Status GatherElementsGradImpl( \ + cudaStream_t stream, \ const int rank, \ TArray& buffer_input_dims, \ TArray& buffer_input_strides, \ diff --git a/onnxruntime/core/providers/cuda/tensor/scatter_elements_impl.h b/onnxruntime/core/providers/cuda/tensor/scatter_elements_impl.h index 5eea6ab808..8f4e676042 100755 --- a/onnxruntime/core/providers/cuda/tensor/scatter_elements_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/scatter_elements_impl.h @@ -11,6 +11,7 @@ namespace cuda { template Status ScatterElementsImpl( + cudaStream_t stream, const int rank, const T* input_data, const int64_t input_size, diff --git a/onnxruntime/core/providers/cuda/tensor/scatter_nd.cc b/onnxruntime/core/providers/cuda/tensor/scatter_nd.cc index 07dd5df43b..d5f632a1bf 100644 --- a/onnxruntime/core/providers/cuda/tensor/scatter_nd.cc +++ b/onnxruntime/core/providers/cuda/tensor/scatter_nd.cc @@ -48,7 +48,7 @@ Status ScatterND::ComputeInternal(OpKernelContext* context) const { if (input_data != output_data) { // TODO: Run benchmarks to determine if a dedicated kernel doing data copy will be faster than invoking cudaMemcpy ? - cudaMemcpyAsync(output_data, input_data, input_tensor->SizeInBytes(), cudaMemcpyDeviceToDevice); + cudaMemcpyAsync(output_data, input_data, input_tensor->SizeInBytes(), cudaMemcpyDeviceToDevice, Stream()); } // Bail out early @@ -71,6 +71,7 @@ Status ScatterND::ComputeInternal(OpKernelContext* context) const { element_counts_and_input_dims_gpu.CopyToGpu(); ORT_RETURN_IF_ERROR(ScatterNDImpl( + Stream(), output_data, element_size, indices_shape.Size() / static_cast(last_index_dimension), diff --git a/onnxruntime/core/providers/cuda/tensor/scatter_nd_impl.cu b/onnxruntime/core/providers/cuda/tensor/scatter_nd_impl.cu index 213e8d9ed2..0651049a5f 100644 --- a/onnxruntime/core/providers/cuda/tensor/scatter_nd_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/scatter_nd_impl.cu @@ -52,6 +52,7 @@ __global__ void _ScatterNDKernel( } Status ScatterNDImpl( + cudaStream_t stream, void* output_data, const size_t element_size, const size_t num_indices, @@ -68,7 +69,7 @@ Status ScatterNDImpl( switch (element_size) { case sizeof(int8_t): - _ScatterNDKernel<<>>( + _ScatterNDKernel<<>>( reinterpret_cast(output_data), num_indices, indices_data, @@ -79,7 +80,7 @@ Status ScatterNDImpl( break; case sizeof(int16_t): - _ScatterNDKernel<<>>( + _ScatterNDKernel<<>>( reinterpret_cast(output_data), num_indices, indices_data, @@ -90,7 +91,7 @@ Status ScatterNDImpl( break; case sizeof(int32_t): - _ScatterNDKernel<<>>( + _ScatterNDKernel<<>>( reinterpret_cast(output_data), num_indices, indices_data, @@ -101,7 +102,7 @@ Status ScatterNDImpl( break; case sizeof(int64_t): - _ScatterNDKernel<<>>( + _ScatterNDKernel<<>>( reinterpret_cast(output_data), num_indices, indices_data, diff --git a/onnxruntime/core/providers/cuda/tensor/scatter_nd_impl.h b/onnxruntime/core/providers/cuda/tensor/scatter_nd_impl.h index de9bad886d..874d275f94 100644 --- a/onnxruntime/core/providers/cuda/tensor/scatter_nd_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/scatter_nd_impl.h @@ -9,6 +9,7 @@ namespace onnxruntime { namespace cuda 
{ Status ScatterNDImpl( + cudaStream_t stream, void* output_data, const size_t element_size, const size_t num_indices, diff --git a/onnxruntime/core/providers/cuda/tensor/slice.cc b/onnxruntime/core/providers/cuda/tensor/slice.cc index d23a686c6f..cf4d7ad75c 100644 --- a/onnxruntime/core/providers/cuda/tensor/slice.cc +++ b/onnxruntime/core/providers/cuda/tensor/slice.cc @@ -83,7 +83,8 @@ REGISTER_V13_TYPED_SLICE(int32_t) REGISTER_V13_TYPED_SLICE(int64_t) REGISTER_V13_TYPED_SLICE(float) -static Status SliceImpCore(const void* input_data, void* output_data, +static Status SliceImpCore(cudaStream_t stream, + const void* input_data, void* output_data, size_t element_size, size_t dimension_count, const TArray& starts_buffer, const TArray& steps_buffer, const TArray& input_strides, const TArray& output_strides, @@ -92,7 +93,8 @@ static Status SliceImpCore(const void* input_data, void* output_data, return Status::OK(); } - return SliceImpl(element_size, + return SliceImpl(stream, + element_size, gsl::narrow_cast(dimension_count), starts_buffer, steps_buffer, @@ -146,7 +148,8 @@ static Status ComputeSliceStrides(const TensorShape& input_shape, return Status::OK(); } -Status Impl(const void* input_data, +Status Impl(cudaStream_t stream, + const void* input_data, const TensorShape& input_shape, void* output_data, SliceOp::PrepareForComputeMetadata& compute_metadata, @@ -163,7 +166,8 @@ Status Impl(const void* input_data, TensorShape output_shape(compute_metadata.output_dims_); - ORT_RETURN_IF_ERROR(SliceImpCore(input_data, + ORT_RETURN_IF_ERROR(SliceImpCore(stream, + input_data, output_data, element_size, gsl::narrow_cast(dimension_count), @@ -237,7 +241,8 @@ Status Slice::CallSliceImp(size_t element_size, size_t dimension_count, const auto* input_tensor = ctx->Input(0); auto* output_tensor = ctx->Output(0, output_shape); - return SliceImpCore(input_tensor->DataRaw(), + return SliceImpCore(Stream(), + input_tensor->DataRaw(), output_tensor->MutableDataRaw(), element_size, gsl::narrow_cast(dimension_count), diff --git a/onnxruntime/core/providers/cuda/tensor/slice.h b/onnxruntime/core/providers/cuda/tensor/slice.h index 8bbd5158b3..b43cbfee78 100644 --- a/onnxruntime/core/providers/cuda/tensor/slice.h +++ b/onnxruntime/core/providers/cuda/tensor/slice.h @@ -11,7 +11,8 @@ namespace cuda { namespace SliceCuda { -Status Impl(const void* input_data, +Status Impl(cudaStream_t stream, + const void* input_data, const TensorShape& input_shape, void* output_data, SliceOp::PrepareForComputeMetadata& prepare_metadata, diff --git a/onnxruntime/core/providers/cuda/tensor/slice_impl.cu b/onnxruntime/core/providers/cuda/tensor/slice_impl.cu index 5a74018852..f8d8a75ed4 100644 --- a/onnxruntime/core/providers/cuda/tensor/slice_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/slice_impl.cu @@ -61,7 +61,8 @@ __global__ void _SliceKernel(const TArray starts, } } -Status SliceImpl(const size_t element_size, +Status SliceImpl(cudaStream_t stream, + const size_t element_size, const int32_t dimension_count, const TArray& starts, const TArray& steps, @@ -70,11 +71,12 @@ Status SliceImpl(const size_t element_size, const void* input_data, void* output_data, const size_t N) { - return SliceImplEx(element_size, dimension_count, starts, steps, input_strides, output_strides, input_data, + return SliceImplEx(stream, element_size, dimension_count, starts, steps, input_strides, output_strides, input_data, output_data, N); } -Status SliceImplGrad(const size_t element_size, +Status SliceImplGrad(cudaStream_t stream, + const 
size_t element_size, const int32_t dimension_count, const TArray& starts, const TArray& steps, @@ -83,14 +85,14 @@ Status SliceImplGrad(const size_t element_size, const void* input_data, void* output_data, const size_t N) { - return SliceImplEx(element_size, dimension_count, starts, steps, input_strides, output_strides, input_data, + return SliceImplEx(stream, element_size, dimension_count, starts, steps, input_strides, output_strides, input_data, output_data, N); } #define HANDLE_DIMS(ELEMENT_TYPE, DIMS) \ case DIMS: { \ _SliceKernel \ - <<>>( \ + <<>>( \ starts, steps, input_strides, output_strides, \ reinterpret_cast::MappedType*>(input_data), \ reinterpret_cast::MappedType*>(output_data), \ @@ -112,7 +114,8 @@ Status SliceImplGrad(const size_t element_size, } break template -Status SliceImplEx(const size_t element_size, +Status SliceImplEx(cudaStream_t stream, + const size_t element_size, const int32_t dimension_count, const TArray& starts, const TArray& steps, diff --git a/onnxruntime/core/providers/cuda/tensor/slice_impl.h b/onnxruntime/core/providers/cuda/tensor/slice_impl.h index 33c6ae3e4c..d691d60f81 100644 --- a/onnxruntime/core/providers/cuda/tensor/slice_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/slice_impl.h @@ -9,7 +9,8 @@ namespace onnxruntime { namespace cuda { template -Status SliceImplEx(const size_t element_size, +Status SliceImplEx(cudaStream_t stream, + const size_t element_size, const int32_t dimension_count, const TArray& starts, const TArray& steps, @@ -19,7 +20,8 @@ Status SliceImplEx(const size_t element_size, void* output_data, const size_t N); -Status SliceImpl(const size_t element_size, +Status SliceImpl(cudaStream_t stream, + const size_t element_size, const int32_t dimension_count, const TArray& starts, const TArray& steps, @@ -29,7 +31,8 @@ Status SliceImpl(const size_t element_size, void* output_data, const size_t N); -Status SliceImplGrad(const size_t element_size, +Status SliceImplGrad(cudaStream_t stream, + const size_t element_size, const int32_t dimension_count, const TArray& starts, const TArray& steps, diff --git a/onnxruntime/core/providers/cuda/tensor/split.cc b/onnxruntime/core/providers/cuda/tensor/split.cc index 539f8016a6..708e3c0cfd 100644 --- a/onnxruntime/core/providers/cuda/tensor/split.cc +++ b/onnxruntime/core/providers/cuda/tensor/split.cc @@ -103,7 +103,8 @@ Status Split::ComputeInternal(OpKernelContext* ctx) const { axis_dimension_input_output_mapping_gpu.CopyToGpu(); size_t element_size = input_tensor->DataType()->Size(); - ORT_RETURN_IF_ERROR(SplitImpl(element_size, + ORT_RETURN_IF_ERROR(SplitImpl(Stream(), + element_size, block_size_including_axis_dim, block_size_inside_axis_dim, split_sizes_gpu.GpuPtr(), diff --git a/onnxruntime/core/providers/cuda/tensor/split_impl.cu b/onnxruntime/core/providers/cuda/tensor/split_impl.cu index 2c87126aa1..f1565428d6 100644 --- a/onnxruntime/core/providers/cuda/tensor/split_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/split_impl.cu @@ -39,7 +39,8 @@ __global__ void _SplitKernel(const fast_divmod block_size_including_axis_dim_div reinterpret_cast(output_ptr[output_index])[output_pos] = input_data[id]; } -Status SplitImpl(const size_t element_size, +Status SplitImpl(cudaStream_t stream, + const size_t element_size, const int block_size_including_axis_dim, const int block_size_inside_axis_dim, const int64_t* split_sizes, @@ -56,7 +57,7 @@ Status SplitImpl(const size_t element_size, switch (element_size) { case sizeof(int8_t): - _SplitKernel<<>>( + _SplitKernel<<>>( 
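The ScatterND, Slice, and Split implementations are type-erased by element byte width, so each switch case instantiates the kernel for a matching integer type and forwards the same stream. A hedged sketch of that dispatch shape, using a hypothetical CopyElementsKernel rather than the real ORT kernels:

#include <cstdint>
#include <cuda_runtime.h>

template <typename T>
__global__ void CopyElementsKernel(const T* src, T* dst, size_t n) {
  size_t i = blockIdx.x * static_cast<size_t>(blockDim.x) + threadIdx.x;
  if (i < n) dst[i] = src[i];
}

// The caller only knows the element size in bytes; every branch launches on the same stream.
cudaError_t LaunchByElementSize(cudaStream_t stream, size_t element_size,
                                const void* src, void* dst, size_t n) {
  constexpr int kThreads = 256;
  const int blocks = static_cast<int>((n + kThreads - 1) / kThreads);
  switch (element_size) {
    case sizeof(int8_t):
      CopyElementsKernel<int8_t><<<blocks, kThreads, 0, stream>>>(
          static_cast<const int8_t*>(src), static_cast<int8_t*>(dst), n);
      break;
    case sizeof(int16_t):
      CopyElementsKernel<int16_t><<<blocks, kThreads, 0, stream>>>(
          static_cast<const int16_t*>(src), static_cast<int16_t*>(dst), n);
      break;
    case sizeof(int32_t):
      CopyElementsKernel<int32_t><<<blocks, kThreads, 0, stream>>>(
          static_cast<const int32_t*>(src), static_cast<int32_t*>(dst), n);
      break;
    case sizeof(int64_t):
      CopyElementsKernel<int64_t><<<blocks, kThreads, 0, stream>>>(
          static_cast<const int64_t*>(src), static_cast<int64_t*>(dst), n);
      break;
    default:
      return cudaErrorInvalidValue;
  }
  return cudaGetLastError();
}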
block_size_including_axis_dim_div, block_size_inside_axis_dim_div, split_sizes, split_sizes_range, axis_dimension_input_output_mapping, num_outputs, reinterpret_cast::MappedType*>(input_data), @@ -64,7 +65,7 @@ Status SplitImpl(const size_t element_size, (CUDA_LONG)N); break; case sizeof(int16_t): - _SplitKernel<<>>( + _SplitKernel<<>>( block_size_including_axis_dim_div, block_size_inside_axis_dim_div, split_sizes, split_sizes_range, axis_dimension_input_output_mapping, num_outputs, reinterpret_cast::MappedType*>(input_data), @@ -72,7 +73,7 @@ Status SplitImpl(const size_t element_size, (CUDA_LONG)N); break; case sizeof(int32_t): - _SplitKernel<<>>( + _SplitKernel<<>>( block_size_including_axis_dim_div, block_size_inside_axis_dim_div, split_sizes, split_sizes_range, axis_dimension_input_output_mapping, num_outputs, reinterpret_cast::MappedType*>(input_data), @@ -80,7 +81,7 @@ Status SplitImpl(const size_t element_size, (CUDA_LONG)N); break; case sizeof(int64_t): - _SplitKernel<<>>( + _SplitKernel<<>>( block_size_including_axis_dim_div, block_size_inside_axis_dim_div, split_sizes, split_sizes_range, axis_dimension_input_output_mapping, num_outputs, reinterpret_cast::MappedType*>(input_data), diff --git a/onnxruntime/core/providers/cuda/tensor/split_impl.h b/onnxruntime/core/providers/cuda/tensor/split_impl.h index fa07a68fb5..a8fde02549 100644 --- a/onnxruntime/core/providers/cuda/tensor/split_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/split_impl.h @@ -9,7 +9,8 @@ namespace onnxruntime { namespace cuda { -Status SplitImpl(const size_t element_size, +Status SplitImpl(cudaStream_t stream, + const size_t element_size, const int block_size_including_axis_dim, const int block_size_inside_axis_dim, const int64_t* split_sizes, diff --git a/onnxruntime/core/providers/cuda/tensor/squeeze.cc b/onnxruntime/core/providers/cuda/tensor/squeeze.cc index b6cd7317ea..3cd5eab410 100644 --- a/onnxruntime/core/providers/cuda/tensor/squeeze.cc +++ b/onnxruntime/core/providers/cuda/tensor/squeeze.cc @@ -68,7 +68,7 @@ Status Squeeze::ComputeInternal(OpKernelContext* ctx) const { auto count = X->Shape().Size(); auto element_bytes = X->DataType()->Size(); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output, input, count * element_bytes, cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output, input, count * element_bytes, cudaMemcpyDeviceToDevice, Stream())); return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/tensor/tile.cc b/onnxruntime/core/providers/cuda/tensor/tile.cc index 7b68b19ce3..7d70baa3a9 100644 --- a/onnxruntime/core/providers/cuda/tensor/tile.cc +++ b/onnxruntime/core/providers/cuda/tensor/tile.cc @@ -72,7 +72,7 @@ Status Tile::ComputeInternal(OpKernelContext* ctx) const { // Repeat tensor has all 1s in it if (output_shape == input_shape) { - cudaMemcpyAsync(output_tensor.MutableDataRaw(), input_tensor.DataRaw(), input_tensor.SizeInBytes(), cudaMemcpyDeviceToDevice); + cudaMemcpyAsync(output_tensor.MutableDataRaw(), input_tensor.DataRaw(), input_tensor.SizeInBytes(), cudaMemcpyDeviceToDevice, Stream()); return Status::OK(); } @@ -91,6 +91,7 @@ Status Tile::ComputeInternal(OpKernelContext* ctx) const { if (input_tensor.IsDataType() || input_tensor.IsDataType()) { TileMemcpyImpl( + Stream(), reinterpret_cast::MappedType*>(input_data), input_shape.Size(), reinterpret_cast::MappedType*>(output_data), @@ -98,12 +99,14 @@ Status Tile::ComputeInternal(OpKernelContext* ctx) const { } else if (input_tensor.IsDataType() || input_tensor.IsDataType()) { TileMemcpyImpl( + 
Stream(), reinterpret_cast::MappedType*>(input_data), input_shape.Size(), reinterpret_cast::MappedType*>(output_data), output_shape.Size()); } else if (input_tensor.IsDataType()) { TileMemcpyImpl( + Stream(), reinterpret_cast::MappedType*>(input_data), input_shape.Size(), reinterpret_cast::MappedType*>(output_data), @@ -116,6 +119,7 @@ Status Tile::ComputeInternal(OpKernelContext* ctx) const { if (input_tensor.IsDataType() || input_tensor.IsDataType()) { TileBatchedMemcpyImpl( + Stream(), reinterpret_cast::MappedType*>(input_data), num_of_elements_per_batch, input_shape[0], // The tensor is atleast 1-D- this is safe @@ -125,6 +129,7 @@ Status Tile::ComputeInternal(OpKernelContext* ctx) const { } else if (input_tensor.IsDataType() || input_tensor.IsDataType()) { TileBatchedMemcpyImpl( + Stream(), reinterpret_cast::MappedType*>(input_data), num_of_elements_per_batch, input_shape[0], // The tensor is atleast 1-D- this is safe @@ -133,6 +138,7 @@ Status Tile::ComputeInternal(OpKernelContext* ctx) const { output_shape.Size()); } else if (input_tensor.IsDataType()) { TileBatchedMemcpyImpl( + Stream(), reinterpret_cast::MappedType*>(input_data), num_of_elements_per_batch, input_shape[0], // The tensor is atleast 1-D- this is safe @@ -169,6 +175,7 @@ Status Tile::ComputeInternal(OpKernelContext* ctx) const { if (input_tensor.IsDataType() || input_tensor.IsDataType()) { TileImpl( + Stream(), rank, fdm_input_shape, input_strides, @@ -179,6 +186,7 @@ Status Tile::ComputeInternal(OpKernelContext* ctx) const { } else if (input_tensor.IsDataType() || input_tensor.IsDataType()) { TileImpl( + Stream(), rank, fdm_input_shape, input_strides, @@ -188,6 +196,7 @@ Status Tile::ComputeInternal(OpKernelContext* ctx) const { output_tensor.Shape().Size()); } else if (input_tensor.IsDataType()) { TileImpl( + Stream(), rank, fdm_input_shape, input_strides, diff --git a/onnxruntime/core/providers/cuda/tensor/tile_impl.cu b/onnxruntime/core/providers/cuda/tensor/tile_impl.cu index a66db85a2f..d5b6cc931e 100644 --- a/onnxruntime/core/providers/cuda/tensor/tile_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/tile_impl.cu @@ -32,6 +32,7 @@ __global__ void _TileKernel( template void TileImpl( + cudaStream_t stream, const size_t shape_rank, const TArray& fdm_input_shape, const TArray& input_stride, @@ -40,7 +41,7 @@ void TileImpl( T* output_data, const size_t N) { int blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); - _TileKernel<<>>( + _TileKernel<<>>( shape_rank, fdm_input_shape, input_stride, input_data, fdm_output_strides, output_data, (CUDA_LONG)N); } @@ -58,12 +59,13 @@ __global__ void _TileMemcpyKernel( template void TileMemcpyImpl( + cudaStream_t stream, const T* input_data, const size_t num_input_elements, T* output_data, const size_t num_output_elements) { int blocksPerGrid = (int)(ceil(static_cast(num_output_elements) / GridDim::maxThreadsPerBlock)); - _TileMemcpyKernel<<>>( + _TileMemcpyKernel<<>>( input_data, num_input_elements, output_data, (CUDA_LONG)num_output_elements); } @@ -84,6 +86,7 @@ __global__ void _TileBatchedMemcpyKernel( template void TileBatchedMemcpyImpl( + cudaStream_t stream, const T* input_data, const size_t num_of_elements_per_input_batch, const size_t num_input_batch_count, @@ -91,7 +94,7 @@ void TileBatchedMemcpyImpl( T* output_data, const size_t num_output_elements) { int blocksPerGrid = (int)(ceil(static_cast(num_output_elements) / GridDim::maxThreadsPerBlock)); - _TileBatchedMemcpyKernel<<>>( + _TileBatchedMemcpyKernel<<>>( input_data, 
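The Squeeze, Tile, and (further below) Unsqueeze fast paths switch cudaMemcpyAsync over to the kernel's Stream() so the copy is ordered with the rest of the work on a user-supplied stream; without the final argument the copy would be enqueued on the default stream. A minimal sketch of that identity-copy path (CopyOnStream is a hypothetical helper, not ORT code):

#include <cuda_runtime.h>

// Device-to-device identity copy ordered on the caller's stream. No synchronization is
// issued here; the copy simply joins the stream's work queue ahead of later kernels.
cudaError_t CopyOnStream(cudaStream_t stream, void* dst, const void* src, size_t bytes) {
  if (dst == src) {
    return cudaSuccess;  // input and output alias, nothing to do
  }
  return cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToDevice, stream);
}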
num_of_elements_per_input_batch, num_input_batch_count, @@ -101,9 +104,9 @@ void TileBatchedMemcpyImpl( } #define SPECIALIZED_IMPL(T) \ - template void TileImpl(const size_t shape_rank, const TArray& fdm_input_shape, const TArray& input_stride, const T* input_data, const TArray& fdm_output_strides, T* output_data, const size_t N); \ - template void TileMemcpyImpl(const T* input_data, const size_t num_input_elements, T* output_data, const size_t num_output_elements); \ - template void TileBatchedMemcpyImpl(const T* input_data, const size_t num_of_elements_per_input_batch, const size_t num_input_batch_count, const fast_divmod& num_of_elements_per_output_batch, T* output_data, const size_t num_output_elements); + template void TileImpl(cudaStream_t stream, const size_t shape_rank, const TArray& fdm_input_shape, const TArray& input_stride, const T* input_data, const TArray& fdm_output_strides, T* output_data, const size_t N); \ + template void TileMemcpyImpl(cudaStream_t stream, const T* input_data, const size_t num_input_elements, T* output_data, const size_t num_output_elements); \ + template void TileBatchedMemcpyImpl(cudaStream_t stream, const T* input_data, const size_t num_of_elements_per_input_batch, const size_t num_input_batch_count, const fast_divmod& num_of_elements_per_output_batch, T* output_data, const size_t num_output_elements); SPECIALIZED_IMPL(float) SPECIALIZED_IMPL(double) diff --git a/onnxruntime/core/providers/cuda/tensor/tile_impl.h b/onnxruntime/core/providers/cuda/tensor/tile_impl.h index 27404c8d39..a612beabcf 100644 --- a/onnxruntime/core/providers/cuda/tensor/tile_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/tile_impl.h @@ -10,6 +10,7 @@ namespace cuda { template void TileImpl( + cudaStream_t stream, const size_t shape_rank, const TArray& input_shape, const TArray& input_strides, @@ -20,6 +21,7 @@ void TileImpl( template void TileMemcpyImpl( + cudaStream_t stream, const T* input_data, const size_t num_input_elements, T* output_data, @@ -27,6 +29,7 @@ void TileMemcpyImpl( template void TileBatchedMemcpyImpl( + cudaStream_t stream, const T* input_data, const size_t num_of_elements_per_input_batch, const size_t num_input_batch_count, diff --git a/onnxruntime/core/providers/cuda/tensor/transpose.cc b/onnxruntime/core/providers/cuda/tensor/transpose.cc index 8a90b60cd9..f5fb7c0147 100644 --- a/onnxruntime/core/providers/cuda/tensor/transpose.cc +++ b/onnxruntime/core/providers/cuda/tensor/transpose.cc @@ -54,14 +54,15 @@ static std::tuple TryTransposeWithCublas(const std::vector& pe } template -Status TransposeWithCublas(cublasHandle_t cublas_handle, const Tensor& input, Tensor& output, int M, int N) { +Status TransposeWithCublas(cudaStream_t stream, cublasHandle_t cublas_handle, const Tensor& input, Tensor& output, int M, int N) { typedef typename ToCudaType::MappedType CudaT; CudaT one = ToCudaType::FromFloat(1.0f); CudaT zero = ToCudaType::FromFloat(0.0f); const CudaT* input_data = reinterpret_cast(input.Data()); CudaT* output_data = reinterpret_cast(output.MutableData()); CUBLAS_RETURN_IF_ERROR( - cublasTransposeHelper(cublas_handle, + cublasTransposeHelper(stream, + cublas_handle, CUBLAS_OP_T, CUBLAS_OP_T, M, N, &one, input_data, @@ -76,10 +77,11 @@ Status TransposeWithCublas(cublasHandle_t cublas_handle, const Tensor& input, Te Status Transpose::DoTranspose(const Transpose& transpose_kernel, const std::vector& permutations, const Tensor& input, Tensor& output) { - return Transpose::DoTranspose(transpose_kernel.GetDeviceProp(), 
transpose_kernel.CublasHandle(), permutations, input, output); + return Transpose::DoTranspose(transpose_kernel.GetDeviceProp(), transpose_kernel.Stream(), transpose_kernel.CublasHandle(), permutations, input, output); } Status Transpose::DoTranspose(const cudaDeviceProp& prop, + cudaStream_t stream, const cublasHandle_t cublas_handle, const std::vector& permutations, const Tensor& input, Tensor& output, const TensorShape* input_shape_override) { @@ -96,11 +98,11 @@ Status Transpose::DoTranspose(const cudaDeviceProp& prop, int N = std::get<1>(mn); if (M != 0 && N != 0) { if (element_type == utils::GetONNXTensorElementDataType()) { - return TransposeWithCublas(cublas_handle, input, output, M, N); + return TransposeWithCublas(stream, cublas_handle, input, output, M, N); } else if (element_type == utils::GetONNXTensorElementDataType()) { - return TransposeWithCublas(cublas_handle, input, output, M, N); + return TransposeWithCublas(stream, cublas_handle, input, output, M, N); } else { - return TransposeWithCublas(cublas_handle, input, output, M, N); + return TransposeWithCublas(stream, cublas_handle, input, output, M, N); } } } @@ -162,14 +164,14 @@ Status Transpose::DoTranspose(const cudaDeviceProp& prop, size_t element_size = input.DataType()->Size(); if (CanDoTranspose3D(new_rank, new_input_dims, new_permutations)) { - return Transpose3DImpl(element_size, input_shape, tmp_input_strides, + return Transpose3DImpl(stream, element_size, input_shape, tmp_input_strides, input.DataRaw(), output.MutableDataRaw(), output.Shape().Size()); } else if (CanDoTranspose4D(prop, element_size, new_rank, new_input_dims, new_permutations)) { TArray tmp_output_strides(new_rank); for (auto i = 0; i < new_rank; i++) { tmp_output_strides[i] = new_output_strides[new_permutations[i]]; } - return Transpose4DImpl(element_size, input_shape, tmp_input_strides, input.DataRaw(), + return Transpose4DImpl(stream, element_size, input_shape, tmp_input_strides, input.DataRaw(), tmp_output_strides, output.MutableDataRaw(), gsl::narrow(output.Shape().Size())); } @@ -184,7 +186,7 @@ Status Transpose::DoTranspose(const cudaDeviceProp& prop, output_strides[i] = fast_divmod(gsl::narrow_cast(new_output_strides[i])); } - auto status = TransposeImpl(element_size, new_rank, input_strides, input.DataRaw(), + auto status = TransposeImpl(stream, element_size, new_rank, input_strides, input.DataRaw(), output_strides, output.MutableDataRaw(), gsl::narrow(output.Shape().Size())); return status; @@ -208,7 +210,7 @@ Status Transpose::ComputeInternal(OpKernelContext* ctx) const { TensorShape output_shape{output_dims}; Tensor* Y = ctx->Output(0, output_shape); - return DoTranspose(this->GetDeviceProp(), this->CublasHandle(), *p_perm, X, *Y); + return DoTranspose(this->GetDeviceProp(), this->Stream(), this->CublasHandle(), *p_perm, X, *Y); } } // namespace cuda diff --git a/onnxruntime/core/providers/cuda/tensor/transpose.h b/onnxruntime/core/providers/cuda/tensor/transpose.h index c9cd83e5a8..c9b41e9102 100644 --- a/onnxruntime/core/providers/cuda/tensor/transpose.h +++ b/onnxruntime/core/providers/cuda/tensor/transpose.h @@ -23,6 +23,7 @@ class Transpose final : public CudaKernel, public TransposeBase { // `input_shape_override` (if provided) overrides the shape of `input` for compute purposes static Status DoTranspose(const cudaDeviceProp& prop, + cudaStream_t stream, const cublasHandle_t cublas_handle, const std::vector& permutations, const Tensor& input, Tensor& output, const TensorShape* input_shape_override = nullptr); diff --git 
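TransposeWithCublas now receives the stream alongside the cuBLAS handle. The usual way to make cuBLAS follow a caller-provided stream is to bind the handle with cublasSetStream and express the transpose as a geam; the sketch below is that generic idiom under assumed matrix conventions, not the ORT cublasTransposeHelper itself:

#include <cublas_v2.h>
#include <cuda_runtime.h>

// Transpose a rows x cols column-major matrix into d_out (cols x rows) on `stream`,
// using C = 1 * op(A) + 0 * C in geam's supported in-place mode (B == C, transb = N).
cublasStatus_t TransposeFloatOnStream(cublasHandle_t handle, cudaStream_t stream,
                                      const float* d_in, float* d_out,
                                      int rows, int cols) {
  cublasStatus_t status = cublasSetStream(handle, stream);  // later calls use this stream
  if (status != CUBLAS_STATUS_SUCCESS) return status;
  const float one = 1.0f;
  const float zero = 0.0f;
  return cublasSgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                     cols, rows,
                     &one, d_in, rows,    // op(A) = A^T, A is rows x cols with lda = rows
                     &zero, d_out, cols,  // beta = 0, so B's contents are ignored
                     d_out, cols);
}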
a/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu b/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu index 3e7b860fec..10611c9cd9 100644 --- a/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu @@ -40,7 +40,7 @@ bool CanDoTranspose3D(int32_t rank, return false; } -Status Transpose3DImpl(size_t element_size, +Status Transpose3DImpl(cudaStream_t stream, size_t element_size, const TArray& input_shape, const TArray& input_strides, const void* input_data, void* output_data, int64_t N) { dim3 block_size(TILE_DIM, TILE_DIM); @@ -48,25 +48,25 @@ Status Transpose3DImpl(size_t element_size, switch (element_size) { case sizeof(int8_t): - Transpose3DKernel<<>>( + Transpose3DKernel<<>>( input_shape, input_strides, reinterpret_cast::MappedType*>(input_data), reinterpret_cast::MappedType*>(output_data)); break; case sizeof(int16_t): - Transpose3DKernel<<>>( + Transpose3DKernel<<>>( input_shape, input_strides, reinterpret_cast::MappedType*>(input_data), reinterpret_cast::MappedType*>(output_data)); break; case sizeof(int32_t): - Transpose3DKernel<<>>( + Transpose3DKernel<<>>( input_shape, input_strides, reinterpret_cast::MappedType*>(input_data), reinterpret_cast::MappedType*>(output_data)); break; case sizeof(int64_t): - Transpose3DKernel<<>>( + Transpose3DKernel<<>>( input_shape, input_strides, reinterpret_cast::MappedType*>(input_data), reinterpret_cast::MappedType*>(output_data)); @@ -129,7 +129,7 @@ bool CanDoTranspose4D(const cudaDeviceProp& prop, return false; } -Status Transpose4DImpl(size_t element_size, const TArray& input_shape, const TArray& input_strides, const void* input_data, +Status Transpose4DImpl(cudaStream_t stream, size_t element_size, const TArray& input_shape, const TArray& input_strides, const void* input_data, const TArray& output_strides, void* output_data, int N) { unsigned int num_elements_per_thread = 4 * sizeof(int) / static_cast(element_size); // int4 is used in the kernel to access data. 
dim3 block_size(static_cast(input_shape[3] / num_elements_per_thread), static_cast(input_shape[2])); @@ -137,22 +137,22 @@ Status Transpose4DImpl(size_t element_size, const TArray& input_shape, switch (element_size) { case sizeof(int8_t): - Transpose4DKernel<<>>( + Transpose4DKernel<<>>( input_strides, input_data, output_strides, output_data, N / num_elements_per_thread); break; case sizeof(int16_t): - Transpose4DKernel<<>>( + Transpose4DKernel<<>>( input_strides, input_data, output_strides, output_data, N / num_elements_per_thread); break; case sizeof(int32_t): - Transpose4DKernel<<>>( + Transpose4DKernel<<>>( input_strides, input_data, output_strides, output_data, N / num_elements_per_thread); break; case sizeof(int64_t): - Transpose4DKernel<<>>( + Transpose4DKernel<<>>( input_strides, input_data, output_strides, output_data, N / num_elements_per_thread); break; @@ -184,12 +184,12 @@ __global__ void TransposeKernel(int32_t shape_rank, const TArray input_ output_data[id] = input_data[input_index]; } -Status TransposeImpl(size_t element_size, int32_t shape_rank, const TArray& input_strides, +Status TransposeImpl(cudaStream_t stream, size_t element_size, int32_t shape_rank, const TArray& input_strides, const void* input_data, const TArray& fdm_output_strides, void* output_data, int N) { int blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); switch (element_size) { case sizeof(int8_t): - TransposeKernel<<>>( + TransposeKernel<<>>( shape_rank, input_strides, reinterpret_cast::MappedType*>(input_data), fdm_output_strides, @@ -197,7 +197,7 @@ Status TransposeImpl(size_t element_size, int32_t shape_rank, const TArray<<>>( + TransposeKernel<<>>( shape_rank, input_strides, reinterpret_cast::MappedType*>(input_data), fdm_output_strides, @@ -205,7 +205,7 @@ Status TransposeImpl(size_t element_size, int32_t shape_rank, const TArray<<>>( + TransposeKernel<<>>( shape_rank, input_strides, reinterpret_cast::MappedType*>(input_data), fdm_output_strides, @@ -213,7 +213,7 @@ Status TransposeImpl(size_t element_size, int32_t shape_rank, const TArray<<>>( + TransposeKernel<<>>( shape_rank, input_strides, reinterpret_cast::MappedType*>(input_data), fdm_output_strides, diff --git a/onnxruntime/core/providers/cuda/tensor/transpose_impl.h b/onnxruntime/core/providers/cuda/tensor/transpose_impl.h index 5ea7f6e8ce..1a4d469776 100644 --- a/onnxruntime/core/providers/cuda/tensor/transpose_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/transpose_impl.h @@ -9,16 +9,16 @@ namespace onnxruntime { namespace cuda { bool CanDoTranspose3D(int32_t rank, const std::vector& input_dims, const std::vector& permutations); -Status Transpose3DImpl(size_t element_size, const TArray& input_shape, const TArray& input_strides, const void* input_data, +Status Transpose3DImpl(cudaStream_t stream, size_t element_size, const TArray& input_shape, const TArray& input_strides, const void* input_data, void* output_data, int64_t N); bool CanDoTranspose4D(const cudaDeviceProp& prop, size_t element_size, int32_t rank, const std::vector& input_dims, const std::vector& permutations); -Status Transpose4DImpl(size_t element_size, const TArray& input_shape, const TArray& input_strides, const void* input_data, +Status Transpose4DImpl(cudaStream_t stream, size_t element_size, const TArray& input_shape, const TArray& input_strides, const void* input_data, const TArray& output_strides, void* output_data, int N); -Status TransposeImpl(size_t element_size, int32_t shape_rank, const TArray& input_strides, +Status 
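Transpose3DKernel and Transpose4DKernel are tile-based shared-memory transposes; the change here only adds the stream to their launch configuration. For context, a self-contained version of the classic tiled 2-D transpose launched on an explicit stream (generic sketch, not the ORT kernels):

#include <cuda_runtime.h>

constexpr int kTileDim = 32;
constexpr int kBlockRows = 8;

__global__ void TiledTransposeKernel(const float* in, float* out, int rows, int cols) {
  // +1 column of padding avoids shared-memory bank conflicts on the transposed reads.
  __shared__ float tile[kTileDim][kTileDim + 1];

  int x = blockIdx.x * kTileDim + threadIdx.x;  // column index into `in`
  int y = blockIdx.y * kTileDim + threadIdx.y;  // row index into `in`
  for (int j = 0; j < kTileDim; j += kBlockRows) {
    if (x < cols && (y + j) < rows) {
      tile[threadIdx.y + j][threadIdx.x] = in[(y + j) * cols + x];
    }
  }
  __syncthreads();

  x = blockIdx.y * kTileDim + threadIdx.x;  // column index into `out` (= row of `in`)
  y = blockIdx.x * kTileDim + threadIdx.y;  // row index into `out` (= column of `in`)
  for (int j = 0; j < kTileDim; j += kBlockRows) {
    if (x < rows && (y + j) < cols) {
      out[(y + j) * rows + x] = tile[threadIdx.x][threadIdx.y + j];
    }
  }
}

void LaunchTiledTranspose(cudaStream_t stream, const float* d_in, float* d_out,
                          int rows, int cols) {
  dim3 block(kTileDim, kBlockRows);
  dim3 grid((cols + kTileDim - 1) / kTileDim, (rows + kTileDim - 1) / kTileDim);
  TiledTransposeKernel<<<grid, block, 0, stream>>>(d_in, d_out, rows, cols);
}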
TransposeImpl(cudaStream_t stream, size_t element_size, int32_t shape_rank, const TArray& input_strides, const void* input_data, const TArray& fdm_output_strides, void* output_data, int N); } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/unsqueeze.cc b/onnxruntime/core/providers/cuda/tensor/unsqueeze.cc index c528322d0e..9d293b8821 100644 --- a/onnxruntime/core/providers/cuda/tensor/unsqueeze.cc +++ b/onnxruntime/core/providers/cuda/tensor/unsqueeze.cc @@ -50,7 +50,7 @@ Status Unsqueeze::ComputeInternal(OpKernelContext* ctx) const { auto count = p.input_tensor->Shape().Size(); auto element_bytes = p.input_tensor->DataType()->Size(); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output, input, count * element_bytes, cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output, input, count * element_bytes, cudaMemcpyDeviceToDevice, Stream())); return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/tensor/upsample.cc b/onnxruntime/core/providers/cuda/tensor/upsample.cc index a7fa0a6a94..2fec80ab1f 100644 --- a/onnxruntime/core/providers/cuda/tensor/upsample.cc +++ b/onnxruntime/core/providers/cuda/tensor/upsample.cc @@ -87,7 +87,7 @@ Status Upsample::BaseCompute(OpKernelContext* context, size_t temp_buffer_size = CalcResizeBufferSize(mode_, output_dims); auto dims_mapping_buffer = GetScratchBuffer(temp_buffer_size); void* dims_mapping = reinterpret_cast(dims_mapping_buffer.get()); - ResizeImpl(mode_, (int)rank, input_shape, output_shape, + ResizeImpl(Stream(), mode_, (int)rank, input_shape, output_shape, input_strides, output_div_pitches, scales_vals, roi_vals, reinterpret_cast(X->template Data()), reinterpret_cast(Y->template MutableData()), @@ -102,7 +102,8 @@ Status Upsample::BaseCompute(OpKernelContext* context, scales_div[i] = fast_divmod(gsl::narrow_cast(ceil(scales[i]))); } - UpampleImpl(mode_, + UpampleImpl(Stream(), + mode_, rank, (UpsampleMode::LINEAR == mode_) ? (rank == 2 ? 
X_dims[0] : X_dims[2]) : 0, input_strides, diff --git a/onnxruntime/core/providers/cuda/tensor/upsample_impl.cu b/onnxruntime/core/providers/cuda/tensor/upsample_impl.cu index 7409e707cc..83d83ef9d8 100644 --- a/onnxruntime/core/providers/cuda/tensor/upsample_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/upsample_impl.cu @@ -149,7 +149,8 @@ __global__ void _UpampleBilinear2DInputKernel(const int64_t input_dim0, } template -void UpampleImpl(const onnxruntime::UpsampleMode upsample_mode, +void UpampleImpl(cudaStream_t stream, + const onnxruntime::UpsampleMode upsample_mode, const size_t rank, const int64_t input_dim2, const TArray& input_pitches, @@ -160,22 +161,23 @@ void UpampleImpl(const onnxruntime::UpsampleMode upsample_mode, const size_t N) { int blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); if (onnxruntime::UpsampleMode::NN == upsample_mode) { - _UpampleNearestKernel<<>>( + _UpampleNearestKernel<<>>( rank, input_pitches, output_div_pitches, scales_div, input_data, output_data, N); } else if (onnxruntime::UpsampleMode::LINEAR == upsample_mode && rank == 4) { - _UpampleBilinear4DInputKernel<<>>( + _UpampleBilinear4DInputKernel<<>>( input_dim2, input_pitches, output_div_pitches, scales_div, input_data, output_data, N); } else if (onnxruntime::UpsampleMode::LINEAR == upsample_mode && rank == 2) { - _UpampleBilinear2DInputKernel<<>>( + _UpampleBilinear2DInputKernel<<>>( input_dim2, input_pitches, output_div_pitches, scales_div, input_data, output_data, N); } } #define SPECIALIZED_IMPL(T) \ - template void UpampleImpl(const onnxruntime::UpsampleMode upsample_mode, \ + template void UpampleImpl(cudaStream_t stream, \ + const onnxruntime::UpsampleMode upsample_mode, \ const size_t rank, \ const int64_t input_dim2, \ const TArray& input_pitches, \ diff --git a/onnxruntime/core/providers/cuda/tensor/upsample_impl.h b/onnxruntime/core/providers/cuda/tensor/upsample_impl.h index a431f5d61c..32376c198d 100644 --- a/onnxruntime/core/providers/cuda/tensor/upsample_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/upsample_impl.h @@ -11,7 +11,8 @@ namespace onnxruntime { namespace cuda { template -void UpampleImpl(const onnxruntime::UpsampleMode upsample_mode, +void UpampleImpl(cudaStream_t stream, + const onnxruntime::UpsampleMode upsample_mode, const size_t rank, const int64_t input_dim2, const TArray& input_pitches, diff --git a/onnxruntime/core/providers/cuda/tensor/where.cc b/onnxruntime/core/providers/cuda/tensor/where.cc index 2b765789f8..ba85c2cd4c 100644 --- a/onnxruntime/core/providers/cuda/tensor/where.cc +++ b/onnxruntime/core/providers/cuda/tensor/where.cc @@ -174,6 +174,7 @@ Status Where::ComputeInternal(OpKernelContext* context) const { ORT_RETURN_IF_ERROR(prepare.TernaryElementwiseBroadcastPrepareHelper(condition_shape, X_shape, Y_shape, output_shape)); WhereImpl( + Stream(), prepare.output_rank_or_simple_broadcast, prepare.a_index_type, prepare.a_padded_strides, diff --git a/onnxruntime/core/providers/cuda/tensor/where_impl.cu b/onnxruntime/core/providers/cuda/tensor/where_impl.cu index 319007c359..0f2d4a4543 100644 --- a/onnxruntime/core/providers/cuda/tensor/where_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/where_impl.cu @@ -119,7 +119,7 @@ __global__ void _TenaryElementWiseSimple( Y_INDEX_TYPE, \ GridDim::maxThreadsPerBlock, \ GridDim::maxElementsPerThread> \ - <<>>(cond_data, \ + <<>>(cond_data, \ x_data, \ y_data, \ output_data, \ @@ -150,7 +150,7 @@ __global__ void _TenaryElementWiseSimple( Y_INDEX_TYPE, \ 
GridDim::maxThreadsPerBlock, \ GridDim::maxElementsPerThread> \ - <<>>(output_rank_or_simple_broadcast, \ + <<>>(output_rank_or_simple_broadcast, \ cond_padded_strides, \ cond_data, \ x_padded_strides, \ @@ -182,6 +182,7 @@ __global__ void _TenaryElementWiseSimple( template void WhereImpl( + cudaStream_t stream, size_t output_rank_or_simple_broadcast, BroadcastIndexType cond_index_type, const TArray& cond_padded_strides, @@ -212,7 +213,8 @@ void WhereImpl( } #define SPECIALIZED_IMPL(T) \ - template void WhereImpl(size_t output_rank_or_simple_broadcast, \ + template void WhereImpl(cudaStream_t stream, \ + size_t output_rank_or_simple_broadcast, \ BroadcastIndexType cond_index_type, \ const TArray& cond_padded_strides, \ const bool* cond_data, \ diff --git a/onnxruntime/core/providers/cuda/tensor/where_impl.h b/onnxruntime/core/providers/cuda/tensor/where_impl.h index 24cf54f351..e560c4717c 100644 --- a/onnxruntime/core/providers/cuda/tensor/where_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/where_impl.h @@ -11,6 +11,7 @@ namespace cuda { template void WhereImpl( + cudaStream_t stream, size_t output_rank_or_simple_broadcast, BroadcastIndexType cond_index_type, const TArray& cond_padded_strides, diff --git a/onnxruntime/core/providers/rocm/fpgeneric.cu b/onnxruntime/core/providers/rocm/fpgeneric.cu index 072bd17cff..c53934c688 100644 --- a/onnxruntime/core/providers/rocm/fpgeneric.cu +++ b/onnxruntime/core/providers/rocm/fpgeneric.cu @@ -46,21 +46,21 @@ __global__ void CopyVectorHalf(const half* x, int incx, half* y, int incy, int n } // namespace -rocblas_status rocblasTransposeHelper(rocblas_handle, rocblas_operation , rocblas_operation , int m, int n, const half*, const half* A, int, const half*, const half*, int, half* C, int) { +rocblas_status rocblasTransposeHelper(hipStream_t stream, rocblas_handle, rocblas_operation , rocblas_operation , int m, int n, const half*, const half* A, int, const half*, const half*, int, half* C, int) { if (C != A) { dim3 dimGrid((n + TRANS_TILE_DIM - 1) / TRANS_TILE_DIM, (m + TRANS_TILE_DIM - 1) / TRANS_TILE_DIM, 1); dim3 dimBlock(TRANS_TILE_DIM, BLOCK_ROWS, 1); - hipLaunchKernelGGL(transposeNoOverlap, dim3(dimGrid), dim3(dimBlock), 0, 0, C, A, n, m); + hipLaunchKernelGGL(transposeNoOverlap, dim3(dimGrid), dim3(dimBlock), 0, stream, C, A, n, m); } else { return rocblas_status_not_implemented; } return rocblas_status_success; } -rocblas_status rocblasCopyHelper(rocblas_handle, int n, const half* x, int incx, half* y, int incy) { +rocblas_status rocblasCopyHelper(hipStream_t stream, rocblas_handle, int n, const half* x, int incx, half* y, int incy) { dim3 dimGrid((unsigned int)(n + COPY_BLOCK_DIM - 1) / COPY_BLOCK_DIM, 1, 1); dim3 dimBlock(COPY_BLOCK_DIM, 1, 1); - hipLaunchKernelGGL(CopyVectorHalf, dim3(dimGrid), dim3(dimBlock), 0, 0, x, incx, y, incy, n); + hipLaunchKernelGGL(CopyVectorHalf, dim3(dimGrid), dim3(dimBlock), 0, stream, x, incx, y, incy, n); return rocblas_status_success; } diff --git a/onnxruntime/core/providers/rocm/gpu_data_transfer.cc b/onnxruntime/core/providers/rocm/gpu_data_transfer.cc index 23111395f0..83986e4819 100644 --- a/onnxruntime/core/providers/rocm/gpu_data_transfer.cc +++ b/onnxruntime/core/providers/rocm/gpu_data_transfer.cc @@ -5,16 +5,25 @@ #include "rocm_common.h" namespace onnxruntime { -GPUDataTransfer::GPUDataTransfer() { - // create streams, default is nullptr - streams_[kHipStreamDefault] = nullptr; - HIP_CALL_THROW(hipStreamCreateWithFlags(&streams_[kHipStreamCopyIn], hipStreamNonBlocking)); - 
HIP_CALL_THROW(hipStreamCreateWithFlags(&streams_[kHipStreamCopyOut], hipStreamNonBlocking)); +GPUDataTransfer::GPUDataTransfer(hipStream_t stream, bool do_copy_in_default_stream) { + do_copy_in_default_stream_ = do_copy_in_default_stream; + streams_[kHipStreamDefault] = stream; + if (do_copy_in_default_stream) { + streams_[kHipStreamCopyIn] = stream; + streams_[kHipStreamCopyOut] = stream; + } else { + HIP_CALL_THROW(hipStreamCreateWithFlags(&streams_[kHipStreamCopyIn], hipStreamNonBlocking)); + HIP_CALL_THROW(hipStreamCreateWithFlags(&streams_[kHipStreamCopyOut], hipStreamNonBlocking)); + } } GPUDataTransfer::~GPUDataTransfer() { - HIP_CALL(hipStreamDestroy(streams_[kHipStreamCopyIn])); - HIP_CALL(hipStreamDestroy(streams_[kHipStreamCopyOut])); + if (!do_copy_in_default_stream_ && streams_[kHipStreamCopyIn] != nullptr) { + HIP_CALL(hipStreamDestroy(streams_[kHipStreamCopyIn])); + } + if (!do_copy_in_default_stream_ && streams_[kHipStreamCopyOut] != nullptr) { + HIP_CALL(hipStreamDestroy(streams_[kHipStreamCopyOut])); + } } bool GPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const { @@ -33,24 +42,26 @@ common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int e if (dst_device.Type() == OrtDevice::GPU) { if (src_device.Type() == OrtDevice::CPU && src_device.MemType() == OrtDevice::MemType::CUDA_PINNED) { // copy from pinned memory to GPU, this is non-blocking - HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyHostToDevice, streams_[exec_queue_id])); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyHostToDevice, GetStream(exec_queue_id))); } else if (src_device.Type() == OrtDevice::GPU) { // copying between GPU, this is non-blocking // Copy only if the two addresses are different. 
if (dst_data != src_data) { - HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToDevice, streams_[kHipStreamDefault])); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToDevice, GetStream(kHipStreamDefault))); } } else { // copy from other CPU memory to GPU, this is blocking - HIP_RETURN_IF_ERROR(hipMemcpy(dst_data, src_data, bytes, hipMemcpyHostToDevice)); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyHostToDevice, GetStream(kHipStreamDefault))); + HIP_RETURN_IF_ERROR(hipStreamSynchronize(GetStream(kHipStreamDefault))); } } else if (src_device.Type() == OrtDevice::GPU) { if (dst_device.Type() == OrtDevice::CPU && dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED) { // copying from GPU to pinned memory, this is non-blocking - HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, streams_[exec_queue_id])); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, GetStream(exec_queue_id))); } else { // copying from GPU to CPU memory, this is blocking - HIP_RETURN_IF_ERROR(hipMemcpy(dst_data, src_data, bytes, hipMemcpyDeviceToHost)); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, GetStream(kHipStreamDefault))); + HIP_RETURN_IF_ERROR(hipStreamSynchronize(GetStream(kHipStreamDefault))); } } else { // copying between cpu memory diff --git a/onnxruntime/core/providers/rocm/gpu_data_transfer.h b/onnxruntime/core/providers/rocm/gpu_data_transfer.h index 8001fa48bc..9c07968d19 100644 --- a/onnxruntime/core/providers/rocm/gpu_data_transfer.h +++ b/onnxruntime/core/providers/rocm/gpu_data_transfer.h @@ -17,7 +17,7 @@ enum HIPStreamType : int { class GPUDataTransfer : public IDataTransfer { public: - GPUDataTransfer(); + GPUDataTransfer(hipStream_t stream, bool do_copy_in_default_stream = true); ~GPUDataTransfer(); bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override; @@ -32,6 +32,7 @@ class GPUDataTransfer : public IDataTransfer { } private: + bool do_copy_in_default_stream_; hipStream_t streams_[kTotalHipStreams]; }; diff --git a/onnxruntime/core/providers/rocm/math/gemm.cc b/onnxruntime/core/providers/rocm/math/gemm.cc index 413744d595..b571e7930f 100644 --- a/onnxruntime/core/providers/rocm/math/gemm.cc +++ b/onnxruntime/core/providers/rocm/math/gemm.cc @@ -83,6 +83,7 @@ Status Gemm::ComputeInternal(OpKernelContext* ctx) const { if (b_shape.Size() == 1) { // if B is (), (1,) or (1, 1), broadcast the scalar ROCBLAS_RETURN_IF_ERROR(rocblasCopyHelper( + Stream(), RocblasHandle(), M * N, b_data, @@ -115,7 +116,7 @@ Status Gemm::ComputeInternal(OpKernelContext* ctx) const { out_data, N)); } else { // B is (M, N), no broadcast needed. - HIP_RETURN_IF_ERROR(hipMemcpyAsync(out_data, b_data, M * N * sizeof(T), hipMemcpyDeviceToDevice)); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(out_data, b_data, M * N * sizeof(T), hipMemcpyDeviceToDevice, Stream())); } } diff --git a/onnxruntime/core/providers/rocm/math/softmax.cc b/onnxruntime/core/providers/rocm/math/softmax.cc index 64fb4cb53c..5d66c742e6 100644 --- a/onnxruntime/core/providers/rocm/math/softmax.cc +++ b/onnxruntime/core/providers/rocm/math/softmax.cc @@ -12,6 +12,7 @@ namespace rocm { template Status SoftMaxComputeHelper( + hipStream_t stream, const T* X, const TensorShape& input_shape, T* Y, @@ -29,7 +30,7 @@ Status SoftMaxComputeHelper( // miopenSoftmaxForward/Backward is not optimal implementation. 
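The GPUDataTransfer constructor now takes the externally supplied compute stream and either reuses it for all copies (do_copy_in_default_stream) or creates dedicated non-blocking copy streams, destroying only the streams it created; blocking pageable copies become an async enqueue followed by an explicit synchronize. A CUDA-flavoured sketch of that ownership and copy logic (the hunks above are the HIP version; the struct and helper names here are illustrative):

#include <cuda_runtime.h>

struct DataTransferStreams {
  cudaStream_t compute = nullptr;
  cudaStream_t copy_in = nullptr;
  cudaStream_t copy_out = nullptr;
  bool owns_copy_streams = false;

  cudaError_t Init(cudaStream_t compute_stream, bool do_copy_in_default_stream) {
    compute = compute_stream;
    if (do_copy_in_default_stream) {
      // Everything shares the user's stream, so copies stay ordered with compute work.
      copy_in = compute_stream;
      copy_out = compute_stream;
      return cudaSuccess;
    }
    owns_copy_streams = true;
    cudaError_t err = cudaStreamCreateWithFlags(&copy_in, cudaStreamNonBlocking);
    if (err != cudaSuccess) return err;
    return cudaStreamCreateWithFlags(&copy_out, cudaStreamNonBlocking);
  }

  void Destroy() {
    // Only tear down streams this object created, never the caller's compute stream.
    if (owns_copy_streams) {
      if (copy_in != nullptr) cudaStreamDestroy(copy_in);
      if (copy_out != nullptr) cudaStreamDestroy(copy_out);
    }
  }
};

// Pageable host -> device copy: async enqueue on the compute stream plus a synchronize,
// mirroring the hipMemcpy -> hipMemcpyAsync + hipStreamSynchronize change above.
cudaError_t CopyHostToDeviceBlocking(const DataTransferStreams& s, void* dst,
                                     const void* src, size_t bytes) {
  cudaError_t err = cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, s.compute);
  if (err != cudaSuccess) return err;
  return cudaStreamSynchronize(s.compute);
}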
// TODO: remove miopen path completely in the future. if (D <= 1024 && D * sizeof(T) <= 4096) { - dispatch_softmax_forward, is_log_softmax>(Y_data, X_data, gsl::narrow_cast(D), gsl::narrow_cast(D), gsl::narrow_cast(N)); + dispatch_softmax_forward, is_log_softmax>(stream, Y_data, X_data, gsl::narrow_cast(D), gsl::narrow_cast(D), gsl::narrow_cast(N)); return Status::OK(); } @@ -51,8 +52,8 @@ Status SoftMaxComputeHelper( } #define SPECIALIZED_SOFTMAX_HELPER_IMPL(T) \ - template Status SoftMaxComputeHelper(const T* input, const TensorShape& shape, T* Y, miopenHandle_t handle, int64_t axis); \ - template Status SoftMaxComputeHelper(const T* input, const TensorShape& shape, T* Y, miopenHandle_t handle, int64_t axis); + template Status SoftMaxComputeHelper(hipStream_t stream, const T* input, const TensorShape& shape, T* Y, miopenHandle_t handle, int64_t axis); \ + template Status SoftMaxComputeHelper(hipStream_t stream, const T* input, const TensorShape& shape, T* Y, miopenHandle_t handle, int64_t axis); SPECIALIZED_SOFTMAX_HELPER_IMPL(float) // SPECIALIZED_SOFTMAX_HELPER_IMPL(double) @@ -119,9 +120,9 @@ Status Softmax::ComputeInternal(OpKernelContext* ctx) const { return Status::OK(); if (log_softmax_) { - return SoftMaxComputeHelper(X_data, input_shape, Y_data, MiopenHandle(), axis_); + return SoftMaxComputeHelper(Stream(), X_data, input_shape, Y_data, MiopenHandle(), axis_); } else { - return SoftMaxComputeHelper(X_data, input_shape, Y_data, MiopenHandle(), axis_); + return SoftMaxComputeHelper(Stream(), X_data, input_shape, Y_data, MiopenHandle(), axis_); } } diff --git a/onnxruntime/core/providers/rocm/math/softmax_impl.cu b/onnxruntime/core/providers/rocm/math/softmax_impl.cu index 94f8e4fc54..2b079949e0 100644 --- a/onnxruntime/core/providers/rocm/math/softmax_impl.cu +++ b/onnxruntime/core/providers/rocm/math/softmax_impl.cu @@ -136,7 +136,7 @@ __global__ void softmax_warp_forward(output_t* dst, const input_t* src, int batc } template -void dispatch_softmax_forward(output_t* dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count) { +void dispatch_softmax_forward(hipStream_t stream, output_t* dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count) { if (softmax_elements == 0) { return; } else { @@ -160,37 +160,37 @@ void dispatch_softmax_forward(output_t* dst, const input_t* src, int softmax_ele // Launch code would be more elegant if C++ supported FOR CONSTEXPR switch (log2_elements) { case 0: // 1 - hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, 0, dst, src, batch_count, softmax_elements_stride, softmax_elements); + hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 1: // 2 - hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, 0, dst, src, batch_count, softmax_elements_stride, softmax_elements); + hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 2: // 4 - hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, 0, dst, src, batch_count, softmax_elements_stride, softmax_elements); + hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements); break; 
case 3: // 8 - hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, 0, dst, src, batch_count, softmax_elements_stride, softmax_elements); + hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 4: // 16 - hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, 0, dst, src, batch_count, softmax_elements_stride, softmax_elements); + hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 5: // 32 - hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, 0, dst, src, batch_count, softmax_elements_stride, softmax_elements); + hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 6: // 64 - hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, 0, dst, src, batch_count, softmax_elements_stride, softmax_elements); + hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 7: // 128 - hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, 0, dst, src, batch_count, softmax_elements_stride, softmax_elements); + hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 8: // 256 - hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, 0, dst, src, batch_count, softmax_elements_stride, softmax_elements); + hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 9: // 512 - hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, 0, dst, src, batch_count, softmax_elements_stride, softmax_elements); + hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 10: // 1024 - hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, 0, dst, src, batch_count, softmax_elements_stride, softmax_elements); + hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements); break; default: break; @@ -199,8 +199,8 @@ void dispatch_softmax_forward(output_t* dst, const input_t* src, int softmax_ele } #define SPECIALIZED_SOFTMAX_IMPL(input_t, output_t, acc_t) \ -template void dispatch_softmax_forward(output_t * dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count); \ -template void dispatch_softmax_forward(output_t * dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count); +template void dispatch_softmax_forward(hipStream_t stream, output_t * dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count); \ +template void dispatch_softmax_forward(hipStream_t stream, output_t * dst, const input_t* src, int 
softmax_elements, int softmax_elements_stride, int batch_count); SPECIALIZED_SOFTMAX_IMPL(float, float, float) SPECIALIZED_SOFTMAX_IMPL(half, half, float) diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc index 5f177e1ecc..55aac97d7a 100644 --- a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc @@ -149,6 +149,7 @@ Status ReduceKernel::ReduceKernelShared( switch (applicable_matrix_reduction) { case ApplicableMatrixReduction::Rows: { return reduce_matrix_rows( + Stream(), reinterpret_cast(X), reinterpret_cast(Y), m, n, false); @@ -167,7 +168,7 @@ Status ReduceKernel::ReduceKernelShared( // ArgMax/ArgMin with FP16 are not supported by miopen, so convert input to fp32 then call miopen temp_X = GetScratchBuffer(input_count); miopen_type_X = miopenFloat; - Impl_Cast(reinterpret_cast(X), temp_X.get(), input_shape.Size()); + Impl_Cast(Stream(), reinterpret_cast(X), temp_X.get(), input_shape.Size()); } // MIOpen requires at least 3D input, so pad 1s if needed @@ -208,7 +209,8 @@ Status ReduceKernel::ReduceKernelShared( input_data_buffer = GetScratchBuffer(input_count); input_data = reinterpret_cast(input_data_buffer.get()); fast_divmod tmp_div; - Impl_Mul(static_cast(SimpleBroadcast::NoBroadcast), nullptr, + Impl_Mul(Stream(), + static_cast(SimpleBroadcast::NoBroadcast), nullptr, reinterpret_cast(X), nullptr, reinterpret_cast(X), nullptr, tmp_div, tmp_div, @@ -233,7 +235,8 @@ Status ReduceKernel::ReduceKernelShared( auto log_sum_result = log_sum_result_buffer.get(); BinaryElementwisePreparation prepare; ORT_RETURN_IF_ERROR(prepare.BinaryElementwiseBroadcastPrepareHelper(input_shape, rhs_shape, input_shape)); - Impl_Sub(prepare.output_rank_or_simple_broadcast, + Impl_Sub(Stream(), + prepare.output_rank_or_simple_broadcast, &prepare.lhs_padded_strides, reinterpret_cast(X), &prepare.rhs_padded_strides, @@ -242,7 +245,8 @@ Status ReduceKernel::ReduceKernelShared( prepare.fdm_H, prepare.fdm_C, reinterpret_cast(exp_result), input_count); - Impl_Exp(reinterpret_cast(exp_result), + Impl_Exp(Stream(), + reinterpret_cast(exp_result), reinterpret_cast(exp_result), input_count); @@ -253,13 +257,15 @@ Status ReduceKernel::ReduceKernelShared( &zero, output_tensor, reinterpret_cast(log_sum_result))); // Log(Sum) - Impl_Log(reinterpret_cast(log_sum_result), + Impl_Log(Stream(), + reinterpret_cast(log_sum_result), reinterpret_cast(log_sum_result), output_count); // Log + ReduceMax fast_divmod tmp_div; - Impl_Add(static_cast(SimpleBroadcast::NoBroadcast), nullptr, + Impl_Add(Stream(), + static_cast(SimpleBroadcast::NoBroadcast), nullptr, reinterpret_cast(log_sum_result), nullptr, reinterpret_cast(Y), nullptr, tmp_div, tmp_div, @@ -276,7 +282,7 @@ Status ReduceKernel::ReduceKernelShared( // miopenReduceTensor for ReduceSum has issue if input and output has same size, we just need to copy the data for this case if (input_count == output_count) { if (reinterpret_cast(Y) != reinterpret_cast(X)) { - HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y, X, input_count * sizeof(T), hipMemcpyDeviceToDevice)); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y, X, input_count * sizeof(T), hipMemcpyDeviceToDevice, Stream())); } } else { MIOPEN_RETURN_IF_ERROR(miopenReduceTensor( @@ -301,11 +307,12 @@ Status ReduceKernel::ReduceKernelShared( } // MIOpen reduction index is uint32_t for now, cast it to int64_t according to ONNX spec - Impl_Cast(reinterpret_cast(indices_rocm.get()), 
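dispatch_softmax_forward picks a compile-time element count by rounding the softmax dimension up to a power of two, switching on its log2, and launching one warp-sized kernel per case on the supplied stream. A hedged sketch of just the sizing arithmetic behind that switch (the names and the exact batches-per-warp heuristic are assumptions to be checked against the source):

#include <algorithm>
#include <cuda_runtime.h>

struct WarpSoftmaxLaunchConfig {
  int log2_elements = 0;
  dim3 grid;
  dim3 block;
};

// Returns false when the warp fast path does not apply (the hunks above bail out for
// more than 1024 elements and fall back to the miopen path).
bool PlanWarpSoftmax(int softmax_elements, int batch_count, WarpSoftmaxLaunchConfig* cfg) {
  if (softmax_elements <= 0 || softmax_elements > 1024) return false;

  int log2_elements = 0;
  while ((1 << log2_elements) < softmax_elements) ++log2_elements;
  const int next_power_of_two = 1 << log2_elements;

  const int warp_size = std::min(next_power_of_two, 32);            // lanes per warp
  const int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;  // assumed heuristic
  constexpr int threads_per_block = 128;
  const int warps_per_block = threads_per_block / warp_size;
  const int batches_per_block = warps_per_block * batches_per_warp;

  cfg->log2_elements = log2_elements;
  cfg->block = dim3(warp_size, warps_per_block, 1);
  cfg->grid = dim3((batch_count + batches_per_block - 1) / batches_per_block, 1, 1);
  return true;
}

The switch over cfg->log2_elements then selects the softmax_warp_forward instantiation for that power of two, and every case launches it with the caller's stream in the launch configuration, which is the only part this diff changes.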
reinterpret_cast(Y), output_count); + Impl_Cast(Stream(), reinterpret_cast(indices_rocm.get()), reinterpret_cast(Y), output_count); } if (calculate_log_) { - Impl_Log(reinterpret_cast(Y), + Impl_Log(Stream(), + reinterpret_cast(Y), reinterpret_cast(Y), output_count); } @@ -421,7 +428,7 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr std::vector& output_dims = prepare_reduce_metadata.output_dims; std::vector& input_dims_miopen = prepare_reduce_metadata.input_dims_miopen; std::vector& output_dims_miopen = prepare_reduce_metadata.output_dims_miopen; - + hipStream_t stream = static_cast(rocm_ep.GetComputeStream()); // special case when there is a dim value of 0 in the shape. if (input_count == 0) { assert(output.Shape().Size() == 0); @@ -436,6 +443,7 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr switch (applicable_matrix_reduction) { case ApplicableMatrixReduction::Rows: { return reduce_matrix_rows( + stream, reinterpret_cast(input.template Data()), reinterpret_cast(output.template MutableData()), m, n); @@ -444,6 +452,7 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr const auto buffer_size_bytes = compute_reduce_matrix_columns_buffer_size(m, n); auto buffer = rocm_ep.GetScratchBuffer(buffer_size_bytes); return reduce_matrix_columns( + stream, reinterpret_cast(input.template Data()), reinterpret_cast(output.template MutableData()), m, n, buffer.get(), buffer_size_bytes); @@ -455,7 +464,7 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr // This reduction keep adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000. // Therefore zeroing out the memory is required - HIP_RETURN_IF_ERROR(hipMemsetAsync(output.MutableDataRaw(), 0, output.SizeInBytes())); + HIP_RETURN_IF_ERROR(hipMemsetAsync(output.MutableDataRaw(), 0, output.SizeInBytes(), stream)); IAllocatorUniquePtr temp_X; miopenDataType_t miopen_type_X = MiopenTensor::GetDataType(); @@ -464,7 +473,7 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr // ArgMax/ArgMin with FP16 are not supported by miopen, so convert input to fp32 then call miopen temp_X = rocm_ep.GetScratchBuffer(input_count); miopen_type_X = miopenFloat; - Impl_Cast(reinterpret_cast(input.template Data()), temp_X.get(), input_shape.Size()); + Impl_Cast(stream, reinterpret_cast(input.template Data()), temp_X.get(), input_shape.Size()); } MiopenReduceDescriptor reduce_desc; @@ -497,7 +506,8 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr input_data_buffer = rocm_ep.GetScratchBuffer(input_count); input_data = reinterpret_cast(input_data_buffer.get()); fast_divmod tmp_div; - Impl_Mul(static_cast(SimpleBroadcast::NoBroadcast), nullptr, + Impl_Mul(stream, + static_cast(SimpleBroadcast::NoBroadcast), nullptr, reinterpret_cast(input.template Data()), nullptr, reinterpret_cast(input.template Data()), nullptr, tmp_div, tmp_div, @@ -507,7 +517,7 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr // This happens when the input is Scalar if (input_count == output_count) { if (output.template MutableData() != input.template Data()) { - HIP_RETURN_IF_ERROR(hipMemcpyAsync(output.template MutableData(), input.template Data(), input_count * sizeof(T), hipMemcpyDeviceToDevice)); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(output.template MutableData(), input.template Data(), input_count * sizeof(T), 
hipMemcpyDeviceToDevice, stream)); } } else { // Reduce max -- Max/Min will output indices data @@ -536,7 +546,8 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr auto log_sum_result = log_sum_result_buffer.get(); BinaryElementwisePreparation prepare; ORT_RETURN_IF_ERROR(prepare.BinaryElementwiseBroadcastPrepareHelper(input_shape, output_shape, input_shape)); - Impl_Sub(prepare.output_rank_or_simple_broadcast, + Impl_Sub(stream, + prepare.output_rank_or_simple_broadcast, &prepare.lhs_padded_strides, reinterpret_cast(input.template Data()), &prepare.rhs_padded_strides, @@ -545,14 +556,15 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr prepare.fdm_H, prepare.fdm_C, reinterpret_cast(exp_result), input_count); - Impl_Exp(reinterpret_cast(exp_result), + Impl_Exp(stream, + reinterpret_cast(exp_result), reinterpret_cast(exp_result), input_count); // miopenReduceTensor for ReduceSum has issue if input and output has same size, we just need to copy the data for this case // This happens when the input is Scalar. We do not need to add anything in this case. if (input_count == output_count) { - HIP_RETURN_IF_ERROR(hipMemcpyAsync(reinterpret_cast(log_sum_result), exp_result, input_count * sizeof(T), hipMemcpyDeviceToDevice)); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(reinterpret_cast(log_sum_result), exp_result, input_count * sizeof(T), hipMemcpyDeviceToDevice, stream)); } else { // ReduceSum MIOPEN_RETURN_IF_ERROR(miopenReduceTensor( @@ -563,13 +575,15 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr } // Log(Sum) - Impl_Log(reinterpret_cast(log_sum_result), + Impl_Log(stream, + reinterpret_cast(log_sum_result), reinterpret_cast(log_sum_result), output_count); // Log + ReduceMax fast_divmod tmp_div; - Impl_Add(static_cast(SimpleBroadcast::NoBroadcast), nullptr, + Impl_Add(stream, + static_cast(SimpleBroadcast::NoBroadcast), nullptr, reinterpret_cast(log_sum_result), nullptr, reinterpret_cast(output.template MutableData()), nullptr, tmp_div, tmp_div, @@ -581,7 +595,7 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr // miopenReduceTensor for ReduceSum has issue if input and output has same size, we just need to copy the data for this case // This happens when the input is Scalar. We do not need to add anything in this case. 
if (input_count == output_count) { - HIP_RETURN_IF_ERROR(hipMemcpyAsync(reinterpret_cast(output.template MutableData()), input_data, input_count * sizeof(T), hipMemcpyDeviceToDevice)); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(reinterpret_cast(output.template MutableData()), input_data, input_count * sizeof(T), hipMemcpyDeviceToDevice, stream)); } else { MIOPEN_RETURN_IF_ERROR(miopenReduceTensor( rocm_ep.PerThreadMiopenHandle(), reduce_desc, indices_rocm.get(), indices_bytes, @@ -593,7 +607,7 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr // miopenReduceTensor for ReduceSum has issue if input and output has same size, we just need to copy the data for this case if (input_count == output_count) { if (output.template MutableData() != input.template Data()) { - HIP_RETURN_IF_ERROR(hipMemcpyAsync(output.template MutableData(), input.template Data(), input_count * sizeof(T), hipMemcpyDeviceToDevice)); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(output.template MutableData(), input.template Data(), input_count * sizeof(T), hipMemcpyDeviceToDevice, stream)); } } else { MIOPEN_RETURN_IF_ERROR(miopenReduceTensor( @@ -607,7 +621,7 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr // miopenReduceTensor has issue if input and output has same size, which will happen if the axis to be reduced has dim value of 1. // the output is zeros of the output size if (input_count == output_count) { - HIP_RETURN_IF_ERROR(hipMemsetAsync(output.template MutableData(), static_cast(0), output_count * sizeof(int64_t))); + HIP_RETURN_IF_ERROR(hipMemsetAsync(output.template MutableData(), static_cast(0), output_count * sizeof(int64_t), stream)); } else { if (temp_X) { auto temp_output = rocm_ep.GetScratchBuffer(output_count); @@ -626,12 +640,13 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr } // MIOpen reduction index is uint32_t for now, cast it to int64_t according to ONNX spec - Impl_Cast(reinterpret_cast(indices_rocm.get()), output.template MutableData(), output_count); + Impl_Cast(stream, reinterpret_cast(indices_rocm.get()), output.template MutableData(), output_count); } } if (calculate_log) { - Impl_Log(reinterpret_cast(output.template MutableData()), + Impl_Log(stream, + reinterpret_cast(output.template MutableData()), reinterpret_cast(output.template MutableData()), output_count); } @@ -661,7 +676,7 @@ Status ReduceKernel::ComputeImpl(OpKernelContext* ctx, miopenR // empty axes and no-op if (axes.empty() && noop_with_empty_axes_) { auto* Y = ctx->Output(0, X->Shape()); - HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes(), hipMemcpyDeviceToDevice)); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes(), hipMemcpyDeviceToDevice, Stream())); return Status::OK(); } @@ -701,7 +716,7 @@ Status ReduceKernel::ComputeImpl // empty axes and no-op if (axes.empty() && noop_with_empty_axes_) { auto* Y = ctx->Output(0, X->Shape()); - HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes(), hipMemcpyDeviceToDevice)); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes(), hipMemcpyDeviceToDevice, Stream())); return Status::OK(); } @@ -728,14 +743,14 @@ Status ReduceKernel::ComputeImpl // miopenReduceTensor for ReduceSum has issue if input and output has same size, we just need to copy the data for this case if (input_count == output_count) { if 
(Y->template MutableData() != X->template Data()) { - HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->template MutableData(), X->template Data(), input_count * sizeof(int32_t), hipMemcpyDeviceToDevice)); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->template MutableData(), X->template Data(), input_count * sizeof(int32_t), hipMemcpyDeviceToDevice, Stream())); } return Status::OK(); } // This reduction keep adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000. // Therefore zeroing out the memory is required - HIP_RETURN_IF_ERROR(hipMemsetAsync(Y->MutableDataRaw(), 0, Y->SizeInBytes())); + HIP_RETURN_IF_ERROR(hipMemsetAsync(Y->MutableDataRaw(), 0, Y->SizeInBytes(), Stream())); size_t indices_bytes = 0; size_t workspace_bytes = 0; @@ -745,7 +760,7 @@ Status ReduceKernel::ComputeImpl miopenDataType_t miopen_type_X = miopenFloat; IAllocatorUniquePtr temp_X = GetScratchBuffer(input_count); - Impl_Cast(reinterpret_cast(X->template Data()), temp_X.get(), X->Shape().Size()); + Impl_Cast(Stream(), reinterpret_cast(X->template Data()), temp_X.get(), X->Shape().Size()); ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, miopen_type_X, MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES)); ORT_RETURN_IF_ERROR(input_tensor.Set(input_dims_miopen, miopen_type_X)); @@ -771,7 +786,7 @@ Status ReduceKernel::ComputeImpl output_tensor, temp_Y.get())); - Impl_Cast(temp_Y.get(), Y->template MutableData(), output_count); + Impl_Cast(Stream(), temp_Y.get(), Y->template MutableData(), output_count); return Status::OK(); } @@ -807,14 +822,14 @@ Status ReduceKernel::ComputeImpl( const auto* const src = X->template Data(); if (input_count == output_count) { if (src != dst) { - HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst, src, input_count * sizeof(int8_t), hipMemcpyDeviceToDevice)); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst, src, input_count * sizeof(int8_t), hipMemcpyDeviceToDevice, Stream())); } return Status::OK(); } // This reduction keep adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000. 
// Therefore zeroing out the memory is required - HIP_RETURN_IF_ERROR(hipMemsetAsync(Y->MutableDataRaw(), 0, Y->SizeInBytes())); + HIP_RETURN_IF_ERROR(hipMemsetAsync(Y->MutableDataRaw(), 0, Y->SizeInBytes(), Stream())); size_t indices_bytes = 0; size_t workspace_bytes = 0; @@ -824,7 +839,7 @@ Status ReduceKernel::ComputeImpl( miopenDataType_t miopen_type_X = miopenFloat; IAllocatorUniquePtr temp_X = GetScratchBuffer(input_count); - Impl_Cast(reinterpret_cast(src), temp_X.get(), X->Shape().Size()); + Impl_Cast(Stream(), reinterpret_cast(src), temp_X.get(), X->Shape().Size()); ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, miopen_type_X, MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES)); ORT_RETURN_IF_ERROR(input_tensor.Set(input_dims_miopen, miopen_type_X)); @@ -850,7 +865,7 @@ Status ReduceKernel::ComputeImpl( output_tensor, temp_Y.get())); - Impl_Cast(temp_Y.get(), dst, output_count); + Impl_Cast(Stream(), temp_Y.get(), dst, output_count); return Status::OK(); } diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc index dbaa7c3a20..eccd3afc02 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc @@ -58,10 +58,14 @@ ONNX_OPERATOR_KERNEL_EX( } // namespace rocm -ROCMExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId device_id, size_t hip_mem_limit, ArenaExtendStrategy arena_extend_strategy) { +ROCMExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId device_id, hipStream_t stream, size_t hip_mem_limit, ArenaExtendStrategy arena_extend_strategy) { HIP_CALL_THROW(hipSetDevice(device_id)); + stream_ = stream; + ROCBLAS_CALL_THROW(rocblas_create_handle(&rocblas_handle_)); + ROCBLAS_CALL_THROW(rocblas_set_stream(rocblas_handle_, stream)); MIOPEN_CALL_THROW(miopenCreate(&miopen_handle_)); + MIOPEN_CALL_THROW(miopenSetStream(miopen_handle_, stream)); AllocatorCreationInfo default_memory_info( [](OrtDevice::DeviceId id) { @@ -104,6 +108,16 @@ ROCMExecutionProvider::ROCMExecutionProvider(const ROCMExecutionProviderInfo& in HIP_CALL_THROW(hipDeviceSynchronize()); HIP_CALL_THROW(hipGetDeviceProperties(&device_prop_, info_.device_id)); + if (info.has_user_compute_stream) { + external_stream_ = true; + stream_ = static_cast(info.user_compute_stream); + } else { + // HIP_CALL_THROW(hipStreamCreateWithFlags(&stream_, hipStreamNonBlocking)); + // TODO: use default stream now due to failures of FusedMatMulOpTest. + // Will check with AMD to verify whether ROCBlas can run with specified stream. 
+ stream_ = nullptr; + } + size_t free = 0; size_t total = 0; HIP_CALL_THROW(hipMemGetInfo(&free, &total)); @@ -168,6 +182,10 @@ ROCMExecutionProvider::~ROCMExecutionProvider() { ORT_IGNORE_RETURN_VALUE(cache->erase(this)); } } + + if (!external_stream_ && stream_) { + HIP_CALL(hipStreamDestroy(stream_)); + } } ROCMExecutionProvider::PerThreadContext& ROCMExecutionProvider::GetPerThreadContext() const { @@ -188,7 +206,7 @@ ROCMExecutionProvider::PerThreadContext& ROCMExecutionProvider::GetPerThreadCont // get or create a context if (context_state_.retired_context_pool.empty()) { - context = std::make_shared(info_.device_id, info_.hip_mem_limit, info_.arena_extend_strategy); + context = std::make_shared(info_.device_id, static_cast(GetComputeStream()), info_.hip_mem_limit, info_.arena_extend_strategy); } else { context = context_state_.retired_context_pool.back(); context_state_.retired_context_pool.pop_back(); @@ -286,7 +304,8 @@ Status ROCMExecutionProvider::OnRunStart() { Status ROCMExecutionProvider::OnRunEnd() { // record deferred release event on default stream, and release per_thread_context auto current_deferred_release_event = GetPerThreadContext().GetCurrentDeferredReleaseEvent(); - HIP_RETURN_IF_ERROR(hipEventRecord(current_deferred_release_event, nullptr)); + HIP_RETURN_IF_ERROR(hipEventRecord(current_deferred_release_event, static_cast(GetComputeStream()))); + HIP_RETURN_IF_ERROR(hipStreamSynchronize(static_cast(GetComputeStream()))); ReleasePerThreadContext(); std::lock_guard lock(deferred_release_cpu_ptr_mutex_); deferred_release_cpu_ptr_[current_deferred_release_event].recorded = true; @@ -1710,7 +1729,7 @@ static bool CastNeedFallbackToCPU(const onnxruntime::Node& node) { } std::unique_ptr ROCMExecutionProvider::GetDataTransfer() const { - return onnxruntime::make_unique(); + return onnxruntime::make_unique(static_cast(GetComputeStream()), info_.do_copy_in_default_stream); } std::vector> diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.h b/onnxruntime/core/providers/rocm/rocm_execution_provider.h index 8859ec42f3..5360f93444 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider.h +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.h @@ -15,6 +15,7 @@ #include "core/providers/rocm/rocm_execution_provider_info.h" #include "core/providers/rocm/rocm_pch.h" #include "core/providers/rocm/shared_inc/rocm_utils.h" +#include "core/providers/rocm/shared_inc/rocm_call.h" namespace onnxruntime { @@ -37,6 +38,20 @@ class ROCMExecutionProvider : public IExecutionProvider { return nullptr; } + Status SetComputeStream(void* stream) override { + if (stream != stream_) { + if (stream_) { + HIP_CALL(hipStreamDestroy(stream_)); + } + + external_stream_ = true; + stream_ = static_cast(stream); + } + return Status::OK(); + } + + void* GetComputeStream() const override { return static_cast(stream_); } + rocblas_handle PerThreadRocblasHandle() { return GetPerThreadContext().RocblasHandle(); } @@ -77,6 +92,8 @@ class ROCMExecutionProvider : public IExecutionProvider { private: ROCMExecutionProviderInfo info_; hipDeviceProp_t device_prop_; + bool external_stream_ = false; + hipStream_t stream_ = nullptr; struct DeferredReleaseCPUPtrs { bool recorded = false; @@ -88,7 +105,7 @@ class ROCMExecutionProvider : public IExecutionProvider { class PerThreadContext final { public: - PerThreadContext(OrtDevice::DeviceId device_id, size_t hip_mem_limit, ArenaExtendStrategy arena_extend_strategy); + PerThreadContext(OrtDevice::DeviceId device_id, hipStream_t stream, 
size_t hip_mem_limit, ArenaExtendStrategy arena_extend_strategy); ~PerThreadContext(); rocblas_handle RocblasHandle() const { @@ -109,17 +126,17 @@ class ROCMExecutionProvider : public IExecutionProvider { if (!constant_ones_float_) { constant_ones_float_ = rocm::CreateConstantOnes(); } - return reinterpret_cast(constant_ones_float_->GetBuffer(count)); + return reinterpret_cast(constant_ones_float_->GetBuffer(stream_, count)); } else if (std::is_same::value) { if (!constant_ones_double_) { constant_ones_double_ = rocm::CreateConstantOnes(); } - return reinterpret_cast(constant_ones_double_->GetBuffer(count)); + return reinterpret_cast(constant_ones_double_->GetBuffer(stream_, count)); } else if (std::is_same::value) { if (!constant_ones_half_) { constant_ones_half_ = rocm::CreateConstantOnes(); } - return reinterpret_cast(constant_ones_half_->GetBuffer(count)); + return reinterpret_cast(constant_ones_half_->GetBuffer(stream_, count)); } else { return nullptr; } @@ -130,6 +147,7 @@ class ROCMExecutionProvider : public IExecutionProvider { } private: + hipStream_t stream_ = nullptr; rocblas_handle rocblas_handle_ = nullptr; miopenHandle_t miopen_handle_ = nullptr; diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider_info.h b/onnxruntime/core/providers/rocm/rocm_execution_provider_info.h index 4bf12499d9..3c2383a467 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider_info.h +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider_info.h @@ -15,6 +15,9 @@ struct ROCMExecutionProviderInfo { OrtDevice::DeviceId device_id{0}; size_t hip_mem_limit{std::numeric_limits::max()}; ArenaExtendStrategy arena_extend_strategy{ArenaExtendStrategy::kNextPowerOfTwo}; + bool do_copy_in_default_stream{true}; + bool has_user_compute_stream{false}; + void* user_compute_stream{nullptr}; static ROCMExecutionProviderInfo FromProviderOptions(const ProviderOptions& options); static ProviderOptions ToProviderOptions(const ROCMExecutionProviderInfo& info); diff --git a/onnxruntime/core/providers/rocm/rocm_kernel.h b/onnxruntime/core/providers/rocm/rocm_kernel.h index f70d7136f4..6c63ded1dd 100644 --- a/onnxruntime/core/providers/rocm/rocm_kernel.h +++ b/onnxruntime/core/providers/rocm/rocm_kernel.h @@ -58,6 +58,8 @@ class RocmKernel : public OpKernel { const hipDeviceProp_t& GetDeviceProp() const { return provider_->GetDeviceProp(); }; + inline hipStream_t Stream() const { return static_cast(provider_->GetComputeStream()); } + // To support hipMemcpyAsync, the cpu memory should be allocated in pinned memory // and it can only be released after the copy has finished template @@ -91,7 +93,7 @@ class RocmKernel : public OpKernel { Status CopyToGpu() { if (cpu_pinned_copy_) { gpu_copy_ = op_kernel_->GetScratchBuffer(count_); - HIP_RETURN_IF_ERROR(hipMemcpyAsync(gpu_copy_.get(), cpu_pinned_copy_.get(), count_ * sizeof(T), hipMemcpyHostToDevice)); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(gpu_copy_.get(), cpu_pinned_copy_.get(), count_ * sizeof(T), hipMemcpyHostToDevice, op_kernel_->Stream())); op_kernel_->AddDeferredReleaseCPUPtr(cpu_pinned_copy_.release()); } return Status::OK(); diff --git a/onnxruntime/core/providers/rocm/rocm_utils.cu b/onnxruntime/core/providers/rocm/rocm_utils.cu index 923f5e64fd..3acbe88015 100644 --- a/onnxruntime/core/providers/rocm/rocm_utils.cu +++ b/onnxruntime/core/providers/rocm/rocm_utils.cu @@ -27,10 +27,10 @@ __global__ void _Fill( } template -void Fill(T* output, T value, int64_t count) { +void Fill(hipStream_t stream, T* output, T value, int64_t count) { 
int blocksPerGrid = static_cast(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); HIP_LONG N = static_cast(count); - hipLaunchKernelGGL(HIP_KERNEL_NAME(_Fill), dim3(blocksPerGrid), dim3(GridDim::maxThreadsPerBlock), 0, 0, output, value, N); + hipLaunchKernelGGL(HIP_KERNEL_NAME(_Fill), dim3(blocksPerGrid), dim3(GridDim::maxThreadsPerBlock), 0, stream, output, value, N); } template class ConstantBufferImpl : public IConstantBuffer { @@ -42,7 +42,7 @@ class ConstantBufferImpl : public IConstantBuffer { hipFree(buffer_); } - virtual const T* GetBuffer(size_t count) { + virtual const T* GetBuffer(hipStream_t stream, size_t count) { if (count > count_) { if (buffer_) { hipFree(buffer_); @@ -51,7 +51,7 @@ class ConstantBufferImpl : public IConstantBuffer { HIP_CALL_THROW(hipMalloc(&buffer_, count * sizeof(T))); count_ = count; - Fill(buffer_, val_, count); + Fill(stream, buffer_, val_, count); } return buffer_; } @@ -72,7 +72,7 @@ template std::unique_ptr> CreateConstantOnes(); template std::unique_ptr> CreateConstantOnes(); #define SPECIALIZED_FILL(T) \ - template void Fill(T * output, T value, int64_t count); + template void Fill(hipStream_t stream, T * output, T value, int64_t count); SPECIALIZED_FILL(int8_t) SPECIALIZED_FILL(int16_t) diff --git a/onnxruntime/core/providers/rocm/shared_inc/fpgeneric.h b/onnxruntime/core/providers/rocm/shared_inc/fpgeneric.h index c61d57ddb9..3fb52c2421 100644 --- a/onnxruntime/core/providers/rocm/shared_inc/fpgeneric.h +++ b/onnxruntime/core/providers/rocm/shared_inc/fpgeneric.h @@ -206,19 +206,19 @@ inline rocblas_status rocblasGemmStridedBatchedHelper(rocblas_handle handle, } // transpose using geam -inline rocblas_status rocblasTransposeHelper(rocblas_handle handle, rocblas_operation transa, rocblas_operation transb, int m, int n, const float* alpha, const float* A, int lda, const float* beta, const float* B, int ldb, float* C, int ldc) { +inline rocblas_status rocblasTransposeHelper(hipStream_t /*stream*/, rocblas_handle handle, rocblas_operation transa, rocblas_operation transb, int m, int n, const float* alpha, const float* A, int lda, const float* beta, const float* B, int ldb, float* C, int ldc) { return rocblas_sgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); } -inline rocblas_status rocblasTransposeHelper(rocblas_handle handle, rocblas_operation transa, rocblas_operation transb, int m, int n, const double* alpha, const double* A, int lda, const double* beta, const double* B, int ldb, double* C, int ldc) { +inline rocblas_status rocblasTransposeHelper(hipStream_t /*stream*/, rocblas_handle handle, rocblas_operation transa, rocblas_operation transb, int m, int n, const double* alpha, const double* A, int lda, const double* beta, const double* B, int ldb, double* C, int ldc) { return rocblas_dgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); } -rocblas_status rocblasTransposeHelper(rocblas_handle, rocblas_operation , rocblas_operation , int m, int n, const half*, const half* A, int, const half*, const half*, int, half* C, int); +rocblas_status rocblasTransposeHelper(hipStream_t stream, rocblas_handle, rocblas_operation , rocblas_operation , int m, int n, const half*, const half* A, int, const half*, const half*, int, half* C, int); // copy -inline rocblas_status rocblasCopyHelper(rocblas_handle handle, int n, const float* x, int incx, float* y, int incy) { +inline rocblas_status rocblasCopyHelper(hipStream_t /*stream*/, rocblas_handle handle, int n, const float* x, int incx, 
float* y, int incy) { return rocblas_scopy(handle, n, x, incx, y, incy); } -inline rocblas_status rocblasCopyHelper(rocblas_handle handle, int n, const double* x, int incx, double* y, int incy) { +inline rocblas_status rocblasCopyHelper(hipStream_t /*stream*/, rocblas_handle handle, int n, const double* x, int incx, double* y, int incy) { return rocblas_dcopy(handle, n, x, incx, y, incy); } -rocblas_status rocblasCopyHelper(rocblas_handle handle, int n, const half* x, int incx, half* y, int incy); +rocblas_status rocblasCopyHelper(hipStream_t stream, rocblas_handle handle, int n, const half* x, int incx, half* y, int incy); diff --git a/onnxruntime/core/providers/rocm/tensor/gather_nd_impl.cu b/onnxruntime/core/providers/rocm/tensor/gather_nd_impl.cu index 332e9befa3..5d70cbe7d4 100644 --- a/onnxruntime/core/providers/rocm/tensor/gather_nd_impl.cu +++ b/onnxruntime/core/providers/rocm/tensor/gather_nd_impl.cu @@ -52,6 +52,7 @@ __global__ void _GatherNDKernel( template void ComputeSliceOffsetsImpl( + hipStream_t stream, const int64_t batch_dims, const TArray input_dims, const size_t num_slices, @@ -62,7 +63,7 @@ void ComputeSliceOffsetsImpl( const TIndex* const indices_data, // num_slices * num_slice_dims elements int64_t* const input_slice_offsets_data) { // num_slices elements const auto blocks_per_grid = CeilDiv(num_slices, GridDim::maxThreadsPerBlock); - hipLaunchKernelGGL(_ComputeSliceOffsetsKernel, dim3(blocks_per_grid), dim3(GridDim::maxThreadsPerBlock), 0, 0, + hipLaunchKernelGGL(_ComputeSliceOffsetsKernel, dim3(blocks_per_grid), dim3(GridDim::maxThreadsPerBlock), 0, stream, batch_dims, input_dims, num_slices, @@ -76,18 +77,20 @@ void ComputeSliceOffsetsImpl( template void GatherNDImpl( + hipStream_t stream, const size_t num_slices, const void* input_data, void* output_data, const size_t slice_size, const int64_t* input_slice_offsets_data) { const auto blocks_per_grid = CeilDiv(num_slices * slice_size, GridDim::maxThreadsPerBlock); - hipLaunchKernelGGL(HIP_KERNEL_NAME(_GatherNDKernel), dim3(blocks_per_grid), dim3(GridDim::maxThreadsPerBlock), 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(_GatherNDKernel), dim3(blocks_per_grid), dim3(GridDim::maxThreadsPerBlock), 0, stream, num_slices, static_cast(input_data), static_cast(output_data), slice_size, input_slice_offsets_data); } #define SPECIALIZED_COMPUTE_SLICE_OFFSETS_IMPL(TIndex) \ template void ComputeSliceOffsetsImpl( \ + hipStream_t stream, \ const int64_t batch_dims, \ const TArray input_dims, \ const size_t num_slices, \ @@ -99,7 +102,7 @@ void GatherNDImpl( int64_t* const input_slice_offsets_data); #define SPECIALIZED_IMPL(T) \ - template void GatherNDImpl(const size_t num_slices, const void* input_data, void* output_data, const size_t slice_size, const int64_t* input_slice_offsets_data); + template void GatherNDImpl(hipStream_t stream, const size_t num_slices, const void* input_data, void* output_data, const size_t slice_size, const int64_t* input_slice_offsets_data); SPECIALIZED_COMPUTE_SLICE_OFFSETS_IMPL(int32_t) SPECIALIZED_COMPUTE_SLICE_OFFSETS_IMPL(int64_t) diff --git a/onnxruntime/core/providers/rocm/tensor/transpose.cc b/onnxruntime/core/providers/rocm/tensor/transpose.cc index 9b59b3acf6..38b2a9cef1 100644 --- a/onnxruntime/core/providers/rocm/tensor/transpose.cc +++ b/onnxruntime/core/providers/rocm/tensor/transpose.cc @@ -54,14 +54,15 @@ static std::tuple TryTransposeWithRocblas(const std::vector& p } template -Status TransposeWithRocblas(rocblas_handle rocblas_handle, const Tensor& input, Tensor& output, int M, int N) 
{ +Status TransposeWithRocblas(hipStream_t stream, rocblas_handle rocblas_handle, const Tensor& input, Tensor& output, int M, int N) { typedef typename ToHipType::MappedType HipT; HipT one = ToHipType::FromFloat(1.0f); HipT zero = ToHipType::FromFloat(0.0f); const HipT* input_data = reinterpret_cast(input.Data()); HipT* output_data = reinterpret_cast(output.MutableData()); ROCBLAS_RETURN_IF_ERROR( - rocblasTransposeHelper(rocblas_handle, + rocblasTransposeHelper(stream, + rocblas_handle, rocblas_operation_transpose, rocblas_operation_transpose, M, N, &one, input_data, @@ -76,10 +77,11 @@ Status TransposeWithRocblas(rocblas_handle rocblas_handle, const Tensor& input, Status Transpose::DoTranspose(const Transpose& transpose_kernel, const std::vector& permutations, const Tensor& input, Tensor& output) { - return Transpose::DoTranspose(transpose_kernel.GetDeviceProp(), transpose_kernel.RocblasHandle(), permutations, input, output); + return Transpose::DoTranspose(transpose_kernel.GetDeviceProp(), transpose_kernel.Stream(), transpose_kernel.RocblasHandle(), permutations, input, output); } Status Transpose::DoTranspose(const hipDeviceProp_t& prop, + hipStream_t stream, const rocblas_handle rocblas_handle, const std::vector& permutations, const Tensor& input, Tensor& output, const TensorShape* input_shape_override) { @@ -96,11 +98,11 @@ Status Transpose::DoTranspose(const hipDeviceProp_t& prop, int N = std::get<1>(mn); if (M != 0 && N != 0) { if (element_type == utils::GetONNXTensorElementDataType()) { - return TransposeWithRocblas(rocblas_handle, input, output, M, N); + return TransposeWithRocblas(stream, rocblas_handle, input, output, M, N); } else if (element_type == utils::GetONNXTensorElementDataType()) { - return TransposeWithRocblas(rocblas_handle, input, output, M, N); + return TransposeWithRocblas(stream, rocblas_handle, input, output, M, N); } else { - return TransposeWithRocblas(rocblas_handle, input, output, M, N); + return TransposeWithRocblas(stream, rocblas_handle, input, output, M, N); } } } @@ -162,14 +164,14 @@ Status Transpose::DoTranspose(const hipDeviceProp_t& prop, size_t element_size = input.DataType()->Size(); if (CanDoTranspose3D(new_rank, new_input_dims, new_permutations)) { - return Transpose3DImpl(element_size, input_shape, tmp_input_strides, + return Transpose3DImpl(stream, element_size, input_shape, tmp_input_strides, input.DataRaw(), output.MutableDataRaw(), output.Shape().Size()); } else if (CanDoTranspose4D(prop, element_size, new_rank, new_input_dims, new_permutations)) { TArray tmp_output_strides(new_rank); for (auto i = 0; i < new_rank; i++) { tmp_output_strides[i] = new_output_strides[new_permutations[i]]; } - return Transpose4DImpl(element_size, input_shape, tmp_input_strides, input.DataRaw(), + return Transpose4DImpl(stream, element_size, input_shape, tmp_input_strides, input.DataRaw(), tmp_output_strides, output.MutableDataRaw(), output.Shape().Size()); } @@ -184,7 +186,7 @@ Status Transpose::DoTranspose(const hipDeviceProp_t& prop, output_strides[i] = fast_divmod(gsl::narrow_cast(new_output_strides[i])); } - auto status = TransposeImpl(element_size, new_rank, input_strides, input.DataRaw(), + auto status = TransposeImpl(stream, element_size, new_rank, input_strides, input.DataRaw(), output_strides, output.MutableDataRaw(), output.Shape().Size()); return status; @@ -208,7 +210,7 @@ Status Transpose::ComputeInternal(OpKernelContext* ctx) const { TensorShape output_shape{output_dims}; Tensor* Y = ctx->Output(0, output_shape); - return 
DoTranspose(this->GetDeviceProp(), this->RocblasHandle(), *p_perm, X, *Y); + return DoTranspose(this->GetDeviceProp(), this->Stream(), this->RocblasHandle(), *p_perm, X, *Y); } } // namespace rocm diff --git a/onnxruntime/core/providers/rocm/tensor/transpose.h b/onnxruntime/core/providers/rocm/tensor/transpose.h index 08b7fd3436..81410fac72 100644 --- a/onnxruntime/core/providers/rocm/tensor/transpose.h +++ b/onnxruntime/core/providers/rocm/tensor/transpose.h @@ -23,6 +23,7 @@ class Transpose final : public RocmKernel, public TransposeBase { // `input_shape_override` (if provided) overrides the shape of `input` for compute purposes static Status DoTranspose(const hipDeviceProp_t& prop, + hipStream_t stream, const rocblas_handle rocblas_handle, const std::vector& permutations, const Tensor& input, Tensor& output, const TensorShape* input_shape_override = nullptr); diff --git a/onnxruntime/core/providers/shared_library/provider_api.h b/onnxruntime/core/providers/shared_library/provider_api.h index 306c5f945e..2313eb02b8 100644 --- a/onnxruntime/core/providers/shared_library/provider_api.h +++ b/onnxruntime/core/providers/shared_library/provider_api.h @@ -204,7 +204,7 @@ std::unique_ptr CreateCPUAllocator(const OrtMemoryInfo& memory_info) std::unique_ptr CreateCUDAAllocator(int16_t device_id, const char* name); std::unique_ptr CreateCUDAPinnedAllocator(int16_t device_id, const char* name); -std::unique_ptr CreateGPUDataTransfer(); +std::unique_ptr CreateGPUDataTransfer(void* stream); std::string GetEnvironmentVar(const std::string& var_name); diff --git a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc index aa8540f5b9..fbb061472c 100644 --- a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc +++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc @@ -163,8 +163,8 @@ std::unique_ptr CreateCUDAPinnedAllocator(int16_t device_id, const c return g_host->CreateCUDAPinnedAllocator(device_id, name); } -std::unique_ptr CreateGPUDataTransfer() { - return g_host->CreateGPUDataTransfer(); +std::unique_ptr CreateGPUDataTransfer(void* stream) { + return g_host->CreateGPUDataTransfer(stream); } #endif diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index 5babda61ce..474d133420 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -4,6 +4,10 @@ // Public wrappers around internal ort interfaces (currently) // In the future the internal implementations could derive from these to remove the need for the wrapper implementations +#ifdef USE_TENSORRT +#include +#endif + #define PROVIDER_DISALLOW_ALL(TypeName) \ TypeName() = delete; \ TypeName(const TypeName&) = delete; \ @@ -127,10 +131,10 @@ struct ProviderHost { #ifdef USE_TENSORRT virtual std::unique_ptr CreateCUDAAllocator(int16_t device_id, const char* name) = 0; virtual std::unique_ptr CreateCUDAPinnedAllocator(int16_t device_id, const char* name) = 0; - virtual std::unique_ptr CreateGPUDataTransfer() = 0; + virtual std::unique_ptr CreateGPUDataTransfer(void* stream) = 0; - virtual void cuda__Impl_Cast(const int64_t* input_data, int32_t* output_data, size_t count) = 0; - virtual void cuda__Impl_Cast(const int32_t* input_data, int64_t* output_data, size_t count) = 0; + virtual void cuda__Impl_Cast(void* stream, const int64_t* input_data, 
int32_t* output_data, size_t count) = 0; + virtual void cuda__Impl_Cast(void* stream, const int32_t* input_data, int64_t* output_data, size_t count) = 0; virtual bool CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) = 0; virtual bool CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) = 0; diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index 1b420ccd66..ea2d88a749 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -1,6 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. - #include #include #include @@ -270,16 +269,18 @@ namespace onnxruntime { namespace cuda { template <> void Impl_Cast( + cudaStream_t stream, const int64_t* input_data, int32_t* output_data, size_t count) { - return g_host->cuda__Impl_Cast(input_data, output_data, count); + return g_host->cuda__Impl_Cast(static_cast(stream), input_data, output_data, count); } template <> void Impl_Cast( + cudaStream_t stream, const int32_t* input_data, int64_t* output_data, size_t count) { - return g_host->cuda__Impl_Cast(input_data, output_data, count); + return g_host->cuda__Impl_Cast(static_cast(stream), input_data, output_data, count); } } // namespace cuda @@ -373,6 +374,12 @@ TensorrtLogger& GetTensorrtLogger() { TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProviderInfo& info) : IExecutionProvider{onnxruntime::kTensorrtExecutionProvider, true}, device_id_(info.device_id) { CUDA_CALL_THROW(cudaSetDevice(device_id_)); + if (info.has_user_compute_stream) { + external_stream_ = true; + stream_ = static_cast(info.user_compute_stream); + } else { + CUDA_CALL_THROW(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)); + } // Get environment variables const std::string max_partition_iterations_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kMaxPartitionIterations); @@ -438,7 +445,11 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv } } -TensorrtExecutionProvider::~TensorrtExecutionProvider() {} +TensorrtExecutionProvider::~TensorrtExecutionProvider() { + if (!external_stream_ && stream_) { + CUDA_CALL(cudaStreamDestroy(stream_)); + } +} AllocatorPtr TensorrtExecutionProvider::GetAllocator(int id, OrtMemType mem_type) const { if (mem_type == OrtMemTypeDefault) { @@ -472,7 +483,24 @@ void TensorrtExecutionProvider::RegisterAllocator(std::shared_ptr TensorrtExecutionProvider::GetDataTransfer() const { - return onnxruntime::CreateGPUDataTransfer(); + return onnxruntime::CreateGPUDataTransfer(static_cast(GetComputeStream())); +} + +Status TensorrtExecutionProvider::OnRunEnd() { + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(static_cast(GetComputeStream()))); + return Status::OK(); +} + +Status TensorrtExecutionProvider::SetComputeStream(void* stream) { + if (stream != stream_) { + if (stream_) { + CUDA_RETURN_IF_ERROR(cudaStreamDestroy(stream_)); + } + + external_stream_ = true; + stream_ = static_cast(stream); + } + return Status::OK(); } // Convert GraphViewer graph to GraphProto @@ -1158,7 +1186,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector& fuse }; // Create compute function - compute_info.compute_func = [](FunctionState state, const OrtCustomOpApi* api, OrtKernelContext* 
context) { + compute_info.compute_func = [this](FunctionState state, const OrtCustomOpApi* api, OrtKernelContext* context) { Ort::CustomOpApi ort{*api}; TensorrtFuncState* trt_state = reinterpret_cast(state); std::lock_guard lock(*(trt_state->tensorrt_mu_ptr)); @@ -1176,6 +1204,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector& fuse std::unordered_map dimension_update; std::unordered_map> tensor_shape_values; nvinfer1::IOptimizationProfile* trt_profile = nullptr; + cudaStream_t stream = static_cast(this->GetComputeStream()); // Load serialized engine const std::string cache_path = GetCachePath(trt_state->engine_cache_path, trt_state->trt_node_name_with_precision); @@ -1240,7 +1269,8 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector& fuse switch (tensor_type) { case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: { int32_t* input = new int32_t[shape_size]; - CUDA_RETURN_IF_ERROR(cudaMemcpy(input, ort.GetTensorData(input_tensor), shape_size * sizeof(int32_t), cudaMemcpyDeviceToHost)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(input, ort.GetTensorData(input_tensor), shape_size * sizeof(int32_t), cudaMemcpyDeviceToHost, stream)); + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream)); for (int j = 0; j < shape_size; ++j) { tensor_shape_values[input_name][j] = input[j]; } @@ -1249,7 +1279,8 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector& fuse } case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: { int64_t* input = new int64_t[shape_size]; - CUDA_RETURN_IF_ERROR(cudaMemcpy(input, ort.GetTensorData(input_tensor), shape_size * sizeof(int64_t), cudaMemcpyDeviceToHost)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(input, ort.GetTensorData(input_tensor), shape_size * sizeof(int64_t), cudaMemcpyDeviceToHost, stream)); + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream)); for (int j = 0; j < shape_size; ++j) { tensor_shape_values[input_name][j] = static_cast(input[j]); } @@ -1515,7 +1546,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector& fuse } scratch_buffers.push_back(IAllocator::MakeUniquePtr(alloc, input_dim_size * sizeof(int32_t))); buffers[binding_index] = scratch_buffers.back().get(); - cuda::Impl_Cast(input_tensor_ptr, reinterpret_cast(buffers[binding_index]), input_dim_size); + cuda::Impl_Cast(stream, input_tensor_ptr, reinterpret_cast(buffers[binding_index]), input_dim_size); } break; } @@ -1639,7 +1670,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector& fuse } // Run TRT inference - if (!trt_context->enqueueV2(&buffers[0], nullptr, nullptr)) { + if (!trt_context->enqueueV2(&buffers[0], stream, nullptr)) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "TensorRT EP execution context enqueue failed."); } @@ -1655,7 +1686,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector& fuse if (output_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64) { auto output_tensor_ptr = ort.GetTensorMutableData(output_tensor[i]); if (output_tensor_ptr != nullptr) { - cuda::Impl_Cast(reinterpret_cast(buffers[binding_index]), output_tensor_ptr, output_dim_sizes[i]); + cuda::Impl_Cast(stream, reinterpret_cast(buffers[binding_index]), output_tensor_ptr, output_dim_sizes[i]); } } } diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h index 7ca9e8c1fe..5bc13bcab3 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h @@ -67,6 +67,8 @@ using 
unique_pointer = std::unique_ptr; // Information needed to construct trt execution providers. struct TensorrtExecutionProviderInfo { int device_id{0}; + bool has_user_compute_stream{false}; + void* user_compute_stream{nullptr}; }; // Information to construct kernel function state. @@ -116,7 +118,15 @@ class TensorrtExecutionProvider : public IExecutionProvider { void RegisterAllocator(std::shared_ptr allocator_manager) override; + Status OnRunEnd() override; + + Status SetComputeStream(void* stream) override; + + void* GetComputeStream() const override { return static_cast(stream_); } + private: + bool external_stream_ = false; + cudaStream_t stream_ = nullptr; int max_partition_iterations_ = 1000; int min_subgraph_size_ = 1; size_t max_workspace_size_ = 1 << 30; // 1GB diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc b/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc index f6b569b0a0..66bc8e517f 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc @@ -13,30 +13,43 @@ namespace onnxruntime { void Shutdown_DeleteRegistry(); struct TensorrtProviderFactory : IExecutionProviderFactory { - TensorrtProviderFactory(int device_id) : device_id_(device_id) {} + TensorrtProviderFactory(const TensorrtExecutionProviderInfo& info) : info_{info} {} ~TensorrtProviderFactory() override {} std::unique_ptr CreateProvider() override; private: - int device_id_; + TensorrtExecutionProviderInfo info_; }; std::unique_ptr TensorrtProviderFactory::CreateProvider() { - TensorrtExecutionProviderInfo info; - info.device_id = device_id_; - return onnxruntime::make_unique(info); + return onnxruntime::make_unique(info_); } std::shared_ptr CreateExecutionProviderFactory_Tensorrt(int device_id) { - return std::make_shared(device_id); + TensorrtExecutionProviderInfo info; + info.device_id = device_id; + return std::make_shared(info); +} + +std::shared_ptr CreateExecutionProviderFactory_Tensorrt(const TensorrtExecutionProviderInfo& info) { + return std::make_shared(info); } struct Tensorrt_Provider : Provider { std::shared_ptr CreateExecutionProviderFactory(int device_id) override { - //TODO: This is apparently a bug. 
The consructor parameter is create-arena-flag, not the device-id - // Will be fixed by PR #2850 - return std::make_shared(device_id); + TensorrtExecutionProviderInfo info; + info.device_id = device_id; + return std::make_shared(info); + } + + std::shared_ptr CreateExecutionProviderFactory(const void* provider_options) override { + auto& options = *reinterpret_cast(provider_options); + TensorrtExecutionProviderInfo info; + info.device_id = options.device_id; + info.has_user_compute_stream = options.has_user_compute_stream; + info.user_compute_stream = options.user_compute_stream; + return std::make_shared(info); } void Shutdown() override { diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index aa89b84c40..65316cc686 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -429,6 +429,11 @@ common::Status InferenceSession::RegisterExecutionProvider(std::unique_ptrSetComputeStream(trt_ep->GetComputeStream()); + } } VLOGS(*session_logger_, 1) << "Adding execution provider of type: " << provider_type; diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index 2ee7949503..3bc51f11da 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -1850,6 +1850,15 @@ ORT_API(void, OrtApis::ReleaseArenaCfg, _Frees_ptr_opt_ OrtArenaCfg* ptr) { delete ptr; } +#if defined(ORT_MINIMAL_BUILD) +ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider_TensorRT, + _In_ OrtSessionOptions* options, _In_ const OrtTensorRTProviderOptions* tensorrt_options) { + ORT_UNUSED_PARAMETER(options); + ORT_UNUSED_PARAMETER(tensorrt_options); + return CreateStatus(ORT_FAIL, "TensorRT execution provider is not enabled."); +} +#endif + static constexpr OrtApiBase ort_api_base = { &OrtApis::GetApi, &OrtApis::GetVersionString, @@ -2084,6 +2093,7 @@ static constexpr OrtApi ort_api_1_to_7 = { // Version 7 - In development, feel free to add/remove/rearrange here &OrtApis::ModelMetadataGetGraphDescription, + &OrtApis::SessionOptionsAppendExecutionProvider_TensorRT, }; // Assert to do a limited check to ensure Version 1 of OrtApi never changes (will detect an addition or deletion but not if they cancel out each other) diff --git a/onnxruntime/core/session/ort_apis.h b/onnxruntime/core/session/ort_apis.h index fa8527d7c9..2418ff8909 100644 --- a/onnxruntime/core/session/ort_apis.h +++ b/onnxruntime/core/session/ort_apis.h @@ -255,4 +255,6 @@ ORT_API_STATUS_IMPL(SetGlobalDenormalAsZero, _Inout_ OrtThreadingOptions* option ORT_API_STATUS_IMPL(CreateArenaCfg, _In_ size_t max_mem, int arena_extend_strategy, int initial_chunk_size_bytes, int max_dead_bytes_per_chunk, _Outptr_ OrtArenaCfg** out); ORT_API(void, ReleaseArenaCfg, _Frees_ptr_opt_ OrtArenaCfg*); +ORT_API_STATUS_IMPL(SessionOptionsAppendExecutionProvider_TensorRT, + _In_ OrtSessionOptions* options, _In_ const OrtTensorRTProviderOptions* tensorrt_options); } // namespace OrtApis diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index 32c7e0f317..f08af799b9 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -188,7 +188,7 @@ std::string nuphar_settings; const OrtDevice::DeviceType OrtDevice::GPU; namespace onnxruntime { -std::shared_ptr CreateExecutionProviderFactory_Tensorrt(int device_id); +std::shared_ptr 
CreateExecutionProviderFactory_Tensorrt(const OrtTensorRTProviderOptions* params); std::shared_ptr CreateExecutionProviderFactory_MIGraphX(int device_id); std::shared_ptr CreateExecutionProviderFactory_Dnnl(int use_arena); std::shared_ptr CreateExecutionProviderFactory_OpenVINO(const OrtOpenVINOProviderOptions* params); @@ -501,7 +501,8 @@ static void RegisterExecutionProviders(InferenceSession* sess, const std::vector sess->GetSessionOptions().enable_cpu_mem_arena)); } else if (type == kTensorrtExecutionProvider) { #ifdef USE_TENSORRT - RegisterExecutionProvider(sess, *onnxruntime::CreateExecutionProviderFactory_Tensorrt(0)); + OrtTensorRTProviderOptions params{0, 0, nullptr}; + RegisterExecutionProvider(sess, *onnxruntime::CreateExecutionProviderFactory_Tensorrt(¶ms)); #endif } else if (type == kMIGraphXExecutionProvider) { #ifdef USE_MIGRAPHX @@ -845,7 +846,11 @@ void addGlobalMethods(py::module& m, Environment& env) { onnxruntime::CreateExecutionProviderFactory_OpenVINO(openvino_device_type, false, "", 8), #endif #ifdef USE_TENSORRT - onnxruntime::CreateExecutionProviderFactory_Tensorrt(0), + onnxruntime::CreateExecutionProviderFactory_Tensorrt( + [&]() { + TensorrtExecutionProviderInfo info{}; + return info; + }()), #endif #ifdef USE_MIGRAPHX onnxruntime::CreateExecutionProviderFactory_MIGraphX(0), diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc index f436e3020e..4db59b3bc8 100644 --- a/onnxruntime/test/framework/inference_session_test.cc +++ b/onnxruntime/test/framework/inference_session_test.cc @@ -330,7 +330,8 @@ void RunModelWithBindingMatMul(InferenceSession& session_object, std::unique_ptr cpu_tensor = onnxruntime::make_unique(element_type, shape, cpu_allocator); - st = GPUDataTransfer().CopyTensor(rtensor, *cpu_tensor.get(), 0); + cudaStream_t stream = static_cast(static_cast(TestCudaExecutionProvider())->GetComputeStream()); + st = GPUDataTransfer(stream).CopyTensor(rtensor, *cpu_tensor.get(), 0); ASSERT_TRUE(st.IsOK()); OrtValue ml_value; ml_value.Init(cpu_tensor.release(), diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index 5c424a6104..4e4830dd08 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -304,8 +304,22 @@ int real_main(int argc, char* argv[], Ort::Env& env) { if (enable_tensorrt) { #ifdef USE_TENSORRT - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Tensorrt(sf, device_id)); - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(sf, device_id)); + OrtTensorRTProviderOptions tensorrt_options{ + 0, + 0, + nullptr}; + + OrtCUDAProviderOptions cuda_options{ + 0, + OrtCudnnConvAlgoSearch::EXHAUSTIVE, + std::numeric_limits::max(), + 0, + true, + 0, + nullptr}; + + sf.AppendExecutionProvider_TensorRT(tensorrt_options); + sf.AppendExecutionProvider_CUDA(cuda_options); #else fprintf(stderr, "TensorRT is not supported in this build"); return -1; @@ -328,7 +342,9 @@ int real_main(int argc, char* argv[], Ort::Env& env) { OrtCudnnConvAlgoSearch::EXHAUSTIVE, std::numeric_limits::max(), 0, - true}; + true, + 0, + nullptr}; sf.AppendExecutionProvider_CUDA(cuda_options); #else fprintf(stderr, "CUDA is not supported in this build"); diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index a573f05691..94a0a97e11 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -46,7 +46,9 @@ 
OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device static_cast(performance_test_config.run_config.cudnn_conv_algo), std::numeric_limits::max(), 0, - !performance_test_config.run_config.do_cuda_copy_in_separate_stream}; + !performance_test_config.run_config.do_cuda_copy_in_separate_stream, + 0, + nullptr}; session_options.AppendExecutionProvider_CUDA(cuda_options); #else ORT_THROW("CUDA is not supported in this build\n"); #endif diff --git a/onnxruntime/test/providers/cuda/reduction_functions_test.cc b/onnxruntime/test/providers/cuda/reduction_functions_test.cc index bfd384b563..23a37a5e80 100644 --- a/onnxruntime/test/providers/cuda/reduction_functions_test.cc +++ b/onnxruntime/test/providers/cuda/reduction_functions_test.cc @@ -71,18 +71,21 @@ void TestReduceRowToScalarApis(int size, float relative_error_tolerance = 1e-4f) cudaMemcpy(device_input.get(), input.data(), size * sizeof(float), cudaMemcpyHostToDevice); ASSERT_STATUS_OK(reduce_sum( + 0, device_input.get(), device_output_sum.get(), size, buffer.get(), buffer_size_in_bytes)); ASSERT_STATUS_OK(reduce_square_sum( + 0, device_input.get(), device_output_square_sum.get(), size, buffer.get(), buffer_size_in_bytes)); ASSERT_STATUS_OK(reduce_mean( + 0, device_input.get(), device_output_mean.get(), size, @@ -121,11 +124,11 @@ void TestReduceRowsToRow(int m, int n, bool reset_initial_output, float relative if (!reset_initial_output) { // manually initialize output data - Fill(d_out.get(), initial_value, n); + Fill(0, d_out.get(), initial_value, n); } ASSERT_STATUS_OK(reduce_matrix_rows( - d_in.get(), d_out.get(), + 0, d_in.get(), d_out.get(), m, n, reset_initial_output)); @@ -164,6 +167,7 @@ void TestReduceColumnsToColumn(int m, int n, float relative_error_tolerance = 1e auto d_buffer = AllocateDeviceMemory(buffer_size_in_bytes); ASSERT_STATUS_OK(reduce_matrix_columns( + 0, d_in.get(), d_out.get(), m, n, d_buffer.get(), buffer_size_in_bytes)); @@ -223,6 +227,7 @@ TEST(ReductionFunctionsTest, BufferOffsets) { cudaMemcpy(d_input.get(), input.data(), m * n * sizeof(double), cudaMemcpyHostToDevice); ASSERT_STATUS_OK(reduce_matrix_columns( + 0, d_input.get(), d_output.get(), m, n, d_buffer.get() + buffer_offset, @@ -250,7 +255,7 @@ TEST(ReductionFunctionsTest, InvalidBufferSize) { cudaMemcpy(d_input.get(), input.data(), m * n * sizeof(float), cudaMemcpyHostToDevice); const auto status = - reduce_matrix_columns(d_input.get(), d_output.get(), m, n, d_buffer.get(), buffer_size_in_bytes); + reduce_matrix_columns(0, d_input.get(), d_output.get(), m, n, d_buffer.get(), buffer_size_in_bytes); ASSERT_FALSE(status.IsOK()); } diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc index 786eccced9..4116cb8f48 100644 --- a/onnxruntime/test/shared_lib/test_inference.cc +++ b/onnxruntime/test/shared_lib/test_inference.cc @@ -724,6 +724,9 @@ TEST(CApiTest, io_binding_cuda) { Ort::Value bound_y = Ort::Value::CreateTensor(info_cuda, reinterpret_cast(output_data.get()), expected_y.size(), expected_y_shape.data(), expected_y_shape.size()); + // Synchronize to make sure the copy on default stream is done since TensorRT isn't using default stream. 
+ cudaStreamSynchronize(nullptr); + Ort::IoBinding binding(session); binding.BindInput("X", bound_x); binding.BindOutput("Y", bound_y); diff --git a/onnxruntime/test/shared_lib/utils.cc b/onnxruntime/test/shared_lib/utils.cc index b27c9e8228..cfa7c7139b 100644 --- a/onnxruntime/test/shared_lib/utils.cc +++ b/onnxruntime/test/shared_lib/utils.cc @@ -27,6 +27,7 @@ void MyCustomKernel::Compute(OrtKernelContext* context) { // Do computation #ifdef USE_CUDA cuda_add(size, out, X, Y); + cudaStreamSynchronize(nullptr); #else for (int64_t i = 0; i < size; i++) { out[i] = X[i] + Y[i]; diff --git a/onnxruntime/test/util/default_providers.cc b/onnxruntime/test/util/default_providers.cc index a0d69269a2..897d14ef79 100644 --- a/onnxruntime/test/util/default_providers.cc +++ b/onnxruntime/test/util/default_providers.cc @@ -25,7 +25,7 @@ std::shared_ptr CreateExecutionProviderFactory_OpenVI std::shared_ptr CreateExecutionProviderFactory_Nuphar(bool, const char*); std::shared_ptr CreateExecutionProviderFactory_Nnapi(uint32_t); std::shared_ptr CreateExecutionProviderFactory_Rknpu(); -std::shared_ptr CreateExecutionProviderFactory_Tensorrt(int device_id); +std::shared_ptr CreateExecutionProviderFactory_Tensorrt(const OrtTensorRTProviderOptions* params); std::shared_ptr CreateExecutionProviderFactory_MIGraphX(int device_id); std::shared_ptr CreateExecutionProviderFactory_ACL(int use_arena); std::shared_ptr CreateExecutionProviderFactory_ArmNN(int use_arena); @@ -43,7 +43,8 @@ std::unique_ptr DefaultCpuExecutionProvider(bool enable_aren std::unique_ptr DefaultTensorrtExecutionProvider() { #ifdef USE_TENSORRT - if (auto factory = CreateExecutionProviderFactory_Tensorrt(0)) + OrtTensorRTProviderOptions params{0, 0, nullptr}; + if (auto factory = CreateExecutionProviderFactory_Tensorrt(¶ms)) return factory->CreateProvider(); #endif return nullptr; diff --git a/orttraining/orttraining/test/training_ops/cuda/cuda_utils_test.cc b/orttraining/orttraining/test/training_ops/cuda/cuda_utils_test.cc index 3a4142db6d..51141b3eb4 100644 --- a/orttraining/orttraining/test/training_ops/cuda/cuda_utils_test.cc +++ b/orttraining/orttraining/test/training_ops/cuda/cuda_utils_test.cc @@ -31,7 +31,7 @@ void TestFillCorrectness(size_t num_elements, TElement value) { std::unique_ptr buffer{ reinterpret_cast(raw_buffer)}; - Fill(buffer.get(), value, num_elements); + Fill(nullptr, buffer.get(), value, num_elements); auto cpu_buffer = onnxruntime::make_unique(num_elements); CUDA_CALL_THROW(cudaMemcpy(cpu_buffer.get(), buffer.get(), num_elements * sizeof(TElement), cudaMemcpyKind::cudaMemcpyDeviceToHost)); diff --git a/orttraining/orttraining/training_ops/cuda/activation/activations_grad.cc b/orttraining/orttraining/training_ops/cuda/activation/activations_grad.cc index 7e4f344f44..1049079082 100644 --- a/orttraining/orttraining/training_ops/cuda/activation/activations_grad.cc +++ b/orttraining/orttraining/training_ops/cuda/activation/activations_grad.cc @@ -24,13 +24,13 @@ namespace cuda { Status x::ComputeInternal(OpKernelContext* context) const { \ BinaryElementwisePreparation prepare; \ ORT_RETURN_IF_ERROR(Prepare(context, &prepare)); \ - CudaAsyncBuffer func_ctx(this, MakeFuncCtx(), 1); \ - if (!std::is_same::value) ORT_RETURN_IF_ERROR(func_ctx.CopyToGpu()); \ + Ctx##x func_ctx = MakeFuncCtx(); \ Impl_##x::MappedType>( \ + Stream(), \ reinterpret_cast::MappedType*>(prepare.lhs_tensor->template Data()), \ reinterpret_cast::MappedType*>(prepare.rhs_tensor->template Data()), \ 
reinterpret_cast::MappedType*>(prepare.output_tensor->template MutableData()), \ - func_ctx.GpuPtr(), prepare.output_tensor->Shape().Size()); \ + &func_ctx, prepare.output_tensor->Shape().Size()); \ return Status::OK(); \ } diff --git a/orttraining/orttraining/training_ops/cuda/activation/activations_grad_impl.cu b/orttraining/orttraining/training_ops/cuda/activation/activations_grad_impl.cu index caa38cac0d..2e7e3bacc2 100644 --- a/orttraining/orttraining/training_ops/cuda/activation/activations_grad_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/activation/activations_grad_impl.cu @@ -33,14 +33,15 @@ struct OP_ReluGrad : public CtxReluGrad { #define BINARY_ELEMENTWISE_IMPL(name) \ BINARY_ELEMENTWISE_IMPL_DECLARATION(name) { \ - BinaryElementWiseNoBroadcastImpl(lhs_data, rhs_data, \ + BinaryElementWiseNoBroadcastImpl(stream, \ + lhs_data, rhs_data, \ output_data, \ *reinterpret_cast*>(func_ctx), \ count); \ } #define SPECIALIZED_BINARY_ELEMENTWISE_IMPL(name, T) \ - template void Impl_##name(const T* lhs_data, const T* rhs_data, T* output_data, const Ctx##name* func_ctx, size_t count); + template void Impl_##name(cudaStream_t stream, const T* lhs_data, const T* rhs_data, T* output_data, const Ctx##name* func_ctx, size_t count); #define SPECIALIZED_BINARY_ELEMENTWISE_IMPL_HFD(x) \ SPECIALIZED_BINARY_ELEMENTWISE_IMPL(x, half) \ diff --git a/orttraining/orttraining/training_ops/cuda/activation/activations_grad_impl.h b/orttraining/orttraining/training_ops/cuda/activation/activations_grad_impl.h index bc5e292652..da23cb595b 100644 --- a/orttraining/orttraining/training_ops/cuda/activation/activations_grad_impl.h +++ b/orttraining/orttraining/training_ops/cuda/activation/activations_grad_impl.h @@ -18,7 +18,8 @@ typedef onnxruntime::cuda::CtxNull CtxReluGrad; #define BINARY_ELEMENTWISE_IMPL_DECLARATION(name) \ template \ - void Impl_##name(const T* lhs_data, \ + void Impl_##name(cudaStream_t stream, \ + const T* lhs_data, \ const T* rhs_data, \ T* output_data, \ const Ctx##name* func_ctx, \ diff --git a/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad.cc b/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad.cc index e219f97951..b948c5c77c 100644 --- a/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad.cc +++ b/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad.cc @@ -39,12 +39,14 @@ ONNX_OPERATOR_KERNEL_EX( template template void BiasGeluGrad_dX::KernelLaunchDispatcher::operator()( + cudaStream_t stream, int64_t input_size, int64_t bias_size, const Tensor& dY, const Tensor& X, const Tensor& B, Tensor& dX) const { using CudaT = typename ToCudaType::MappedType; LaunchBiasGeluGradDxKernel( + stream, input_size, bias_size, reinterpret_cast(dY.template Data()), reinterpret_cast(X.template Data()), @@ -78,7 +80,7 @@ Status BiasGeluGrad_dX::ComputeInternal(OpKernelContext* co KernelLaunchDispatcher, ALL_IEEE_FLOAT_DATA_TYPES> dispatcher{X->GetElementType()}; - dispatcher.Invoke(input_size, bias_size, *dY, *X, *B, *dX); + dispatcher.Invoke(Stream(), input_size, bias_size, *dY, *X, *B, *dX); return Status::OK(); } diff --git a/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad.h b/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad.h index 695dd85b64..695739d1fd 100644 --- a/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad.h +++ b/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad.h @@ -18,6 +18,7 @@ class BiasGeluGrad_dX : public CudaKernel { template struct 
KernelLaunchDispatcher { void operator()( + cudaStream_t stream, int64_t input_size, int64_t bias_size, const Tensor& dY, const Tensor& X, const Tensor& B, Tensor& dX) const; diff --git a/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad_impl.cu b/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad_impl.cu index 2007036db7..d6fae84ca4 100644 --- a/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad_impl.cu @@ -56,6 +56,7 @@ __global__ void BiasGeluGradDxKernel(int64_t bias_size, const T* dY, const T* X, template void LaunchBiasGeluGradDxKernel( + cudaStream_t stream, int64_t input_size, int64_t bias_size, const T* dY, const T* X, const T* B, T* dX) { // given a 2D grid of blocks: @@ -70,13 +71,13 @@ void LaunchBiasGeluGradDxKernel( const dim3 grid_dim{static_cast(grid_width), static_cast(grid_height)}; BiasGeluGradDxKernel - <<>>(bias_size, dY, X, B, dX); + <<>>(bias_size, dY, X, B, dX); } // explicit instantiations #define SPECIALIZED_BIAS_GELU_GRAD_IMPL(T, GeluComputationMode) \ template void LaunchBiasGeluGradDxKernel( \ - int64_t input_size, int64_t bias_size, \ + cudaStream_t stream, int64_t input_size, int64_t bias_size, \ const T* dY, const T* X, const T* B, T* dX) SPECIALIZED_BIAS_GELU_GRAD_IMPL(half, gelu_computation_mode::Default); diff --git a/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad_impl.h b/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad_impl.h index 6625bff938..a2edbb1749 100644 --- a/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad_impl.h +++ b/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad_impl.h @@ -14,6 +14,7 @@ namespace cuda { // - input_size % bias_size == 0 template void LaunchBiasGeluGradDxKernel( + cudaStream_t stream, int64_t input_size, int64_t bias_size, const T* dY, const T* X, const T* B, T* dX); diff --git a/orttraining/orttraining/training_ops/cuda/collective/adasum_kernels.cc b/orttraining/orttraining/training_ops/cuda/collective/adasum_kernels.cc index d43bce09ca..053d5fee1a 100644 --- a/orttraining/orttraining/training_ops/cuda/collective/adasum_kernels.cc +++ b/orttraining/orttraining/training_ops/cuda/collective/adasum_kernels.cc @@ -36,8 +36,8 @@ Status AdasumAllReduce::ComputeInternal(OpKernelContext* context) const { for (int i = 0; i < num_tensors; ++i) { const Tensor* x_tensor = context->Input(i); - CUDA_CALL(cudaMemcpy((uint8_t*)data_buffer_ptr.get() + tensor_offsets[i], x_tensor->DataRaw(), - tensor_sizes[i], cudaMemcpyDeviceToHost)); + CUDA_CALL(cudaMemcpyAsync((uint8_t*)data_buffer_ptr.get() + tensor_offsets[i], x_tensor->DataRaw(), + tensor_sizes[i], cudaMemcpyDeviceToHost, Stream())); } auto recv_buffer = allocator->Alloc(total_recv_buffer_len); @@ -52,8 +52,8 @@ Status AdasumAllReduce::ComputeInternal(OpKernelContext* context) const { for (int i = 0; i < num_tensors; i++) { Tensor* y_tensor = context->Output(i, context->Input(i)->Shape()); - CUDA_CALL(cudaMemcpy(y_tensor->MutableDataRaw(), (uint8_t*)data_buffer + tensor_offsets[i], - tensor_sizes[i], cudaMemcpyHostToDevice)); + CUDA_CALL(cudaMemcpyAsync(y_tensor->MutableDataRaw(), (uint8_t*)data_buffer + tensor_offsets[i], + tensor_sizes[i], cudaMemcpyHostToDevice, Stream())); } return Status::OK(); } diff --git a/orttraining/orttraining/training_ops/cuda/collective/nccl_kernels.cc b/orttraining/orttraining/training_ops/cuda/collective/nccl_kernels.cc index 
41775ae30e..7bd7dabdbc 100644 --- a/orttraining/orttraining/training_ops/cuda/collective/nccl_kernels.cc +++ b/orttraining/orttraining/training_ops/cuda/collective/nccl_kernels.cc @@ -10,7 +10,6 @@ NcclAllReduce::NcclAllReduce(const OpKernelInfo& info) : NcclKernel(info) { } Status NcclAllReduce::ComputeInternal(OpKernelContext* context) const { - cudaStream_t stream = nullptr; // Default stream ncclComm_t comm = nccl_->Comm(group_type_); const void* input_data = context->Input(0)->DataRaw(); @@ -32,7 +31,7 @@ Status NcclAllReduce::ComputeInternal(OpKernelContext* context) const { ncclDataType_t dtype = GetNcclDataType(onnx_type); #ifdef ORT_USE_NCCL - NCCL_RETURN_IF_ERROR(ncclAllReduce(input_data, output_data, input_count, dtype, ncclSum, comm, stream)); + NCCL_RETURN_IF_ERROR(ncclAllReduce(input_data, output_data, input_count, dtype, ncclSum, comm, Stream())); #endif return Status::OK(); } @@ -41,7 +40,6 @@ NcclAllGather::NcclAllGather(const OpKernelInfo& info) : NcclKernel(info) { } Status NcclAllGather::ComputeInternal(OpKernelContext* context) const { - cudaStream_t stream = nullptr; // Default stream ncclComm_t comm = nccl_->Comm(group_type_); const int rank = nccl_->Rank(group_type_); const int size = nccl_->Size(group_type_); @@ -86,7 +84,7 @@ Status NcclAllGather::ComputeInternal(OpKernelContext* context) const { ORT_ENFORCE(offset + tensor_bytes <= rank_end, "A single rank must be responsible for the entire tensor."); void* fusion_data_at_offset = (int8_t*)fusion_data + offset; const void* input_data = input_tensor->DataRaw(); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(fusion_data_at_offset, input_data, tensor_bytes, cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(fusion_data_at_offset, input_data, tensor_bytes, cudaMemcpyDeviceToDevice, Stream())); } offset += tensor_bytes; @@ -95,7 +93,7 @@ Status NcclAllGather::ComputeInternal(OpKernelContext* context) const { // AllGather. const void* fusion_data_rank_offset = (const int8_t*)fusion_data + rank_start; #ifdef ORT_USE_NCCL - NCCL_RETURN_IF_ERROR(ncclAllGather(fusion_data_rank_offset, fusion_data, rank_count, dtype, comm, stream)); + NCCL_RETURN_IF_ERROR(ncclAllGather(fusion_data_rank_offset, fusion_data, rank_count, dtype, comm, Stream())); #endif // Copy AllGather results to outputs. 
@@ -113,12 +111,12 @@ Status NcclAllGather::ComputeInternal(OpKernelContext* context) const { if (offset < rank_start || offset >= rank_end) { void* output_data = output_tensor->MutableDataRaw(); const void* fusion_data_at_offset = (const int8_t*)fusion_data + offset; - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_data, fusion_data_at_offset, tensor_bytes, cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_data, fusion_data_at_offset, tensor_bytes, cudaMemcpyDeviceToDevice, Stream())); } else { const void* input_data = input_tensor->DataRaw(); void* output_data = output_tensor->MutableDataRaw(); if (input_data != output_data) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_data, input_data, tensor_bytes, cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_data, input_data, tensor_bytes, cudaMemcpyDeviceToDevice, Stream())); } } @@ -132,7 +130,6 @@ NcclReduceScatter::NcclReduceScatter(const OpKernelInfo& info) : NcclKernel(info } Status NcclReduceScatter::ComputeInternal(OpKernelContext* context) const { - cudaStream_t stream = nullptr; // Default stream ncclComm_t comm = nccl_->Comm(group_type_); const int rank = nccl_->Rank(group_type_); const int size = nccl_->Size(group_type_); @@ -174,7 +171,7 @@ Status NcclReduceScatter::ComputeInternal(OpKernelContext* context) const { void* fusion_data_at_offset = (int8_t*)fusion_data + offset; const void* input_data = input_tensor->DataRaw(); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(fusion_data_at_offset, input_data, tensor_bytes, cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(fusion_data_at_offset, input_data, tensor_bytes, cudaMemcpyDeviceToDevice, Stream())); offset += tensor_bytes; } @@ -182,7 +179,7 @@ Status NcclReduceScatter::ComputeInternal(OpKernelContext* context) const { // ReduceScatter. void* fusion_data_rank_offset = (int8_t*)fusion_data + rank_start; #ifdef ORT_USE_NCCL - NCCL_RETURN_IF_ERROR(ncclReduceScatter(fusion_data, fusion_data_rank_offset, rank_count, dtype, ncclSum, comm, stream)); + NCCL_RETURN_IF_ERROR(ncclReduceScatter(fusion_data, fusion_data_rank_offset, rank_count, dtype, ncclSum, comm, Stream())); #endif // Copy this rank's ReduceScatter results to outputs. 
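Sketch only, not part of the patch: the NCCL kernels above drop the hard-coded null stream and instead issue both the staging copies and the collectives on the op's Stream(), so they stay ordered without extra synchronization. A reduced version of that idea (AllReduceOnStream is a hypothetical helper; it assumes an already initialized ncclComm_t):

    #include <cuda_runtime.h>
    #include <nccl.h>

    void AllReduceOnStream(cudaStream_t stream, ncclComm_t comm,
                           const float* input, float* staging, float* output, size_t count) {
      // Device-to-device staging copy enqueued on the compute stream
      // (previously it went to the default stream).
      cudaMemcpyAsync(staging, input, count * sizeof(float), cudaMemcpyDeviceToDevice, stream);
      // The collective is enqueued on the same stream, so it runs after the copy.
      ncclAllReduce(staging, output, count, ncclFloat, ncclSum, comm, stream);
    }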
offset = 0; @@ -200,12 +197,12 @@ Status NcclReduceScatter::ComputeInternal(OpKernelContext* context) const { ORT_ENFORCE(offset + tensor_bytes <= rank_end, "A single rank must be responsible for the entire tensor."); void* output_data = output_tensor->MutableDataRaw(); const void* fusion_data_at_offset = (const int8_t*)fusion_data + offset; - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_data, fusion_data_at_offset, tensor_bytes, cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_data, fusion_data_at_offset, tensor_bytes, cudaMemcpyDeviceToDevice, Stream())); } else { const void* input_data = input_tensor->DataRaw(); void* output_data = output_tensor->MutableDataRaw(); if (input_data != output_data) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_data, input_data, tensor_bytes, cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_data, input_data, tensor_bytes, cudaMemcpyDeviceToDevice, Stream())); } } diff --git a/orttraining/orttraining/training_ops/cuda/communication/recv.cc b/orttraining/orttraining/training_ops/cuda/communication/recv.cc index 1c42b0b8d2..fb9b383cf0 100644 --- a/orttraining/orttraining/training_ops/cuda/communication/recv.cc +++ b/orttraining/orttraining/training_ops/cuda/communication/recv.cc @@ -89,11 +89,11 @@ void Recv::ReceiveData( assert(tensor_offset_in_bytes + tensor->SizeInBytes() <= aggregated_aligned_tensor_bytes); // Copy data out from buffer. #if defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) - CUDA_CALL(cudaMemcpy(tensor->MutableDataRaw(), buffer.get() + tensor_offset_in_bytes, - tensor->SizeInBytes(), cudaMemcpyDeviceToDevice)); + CUDA_CALL(cudaMemcpyAsync(tensor->MutableDataRaw(), buffer.get() + tensor_offset_in_bytes, + tensor->SizeInBytes(), cudaMemcpyDeviceToDevice, Stream())); #else - CUDA_CALL(cudaMemcpy(tensor->MutableDataRaw(), buffer.get() + tensor_offset_in_bytes, - tensor->SizeInBytes(), cudaMemcpyHostToDevice)); + CUDA_CALL(cudaMemcpyAsync(tensor->MutableDataRaw(), buffer.get() + tensor_offset_in_bytes, + tensor->SizeInBytes(), cudaMemcpyHostToDevice, Stream())); #endif #ifndef NDEBUG diff --git a/orttraining/orttraining/training_ops/cuda/communication/send.cc b/orttraining/orttraining/training_ops/cuda/communication/send.cc index 72981ce040..6a5bc71fd3 100644 --- a/orttraining/orttraining/training_ops/cuda/communication/send.cc +++ b/orttraining/orttraining/training_ops/cuda/communication/send.cc @@ -66,11 +66,11 @@ void Send::SendData( #endif #if defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) - CUDA_CALL(cudaMemcpy(buffer.get() + tensor_offsets_in_bytes[i], tensor->DataRaw(), - tensor_sizes_in_bytes[i], cudaMemcpyDeviceToDevice)); + CUDA_CALL(cudaMemcpyAsync(buffer.get() + tensor_offsets_in_bytes[i], tensor->DataRaw(), + tensor_sizes_in_bytes[i], cudaMemcpyDeviceToDevice, Stream())); #else - CUDA_CALL(cudaMemcpy(buffer.get() + tensor_offsets_in_bytes[i], tensor->DataRaw(), - tensor_sizes_in_bytes[i], cudaMemcpyDeviceToHost)); + CUDA_CALL(cudaMemcpyAsync(buffer.get() + tensor_offsets_in_bytes[i], tensor->DataRaw(), + tensor_sizes_in_bytes[i], cudaMemcpyDeviceToHost, Stream())); #endif } diff --git a/orttraining/orttraining/training_ops/cuda/loss/softmax_cross_entropy_loss_impl.cc b/orttraining/orttraining/training_ops/cuda/loss/softmax_cross_entropy_loss_impl.cc index 51d2f7bfbb..5ff75fb54b 100644 --- a/orttraining/orttraining/training_ops/cuda/loss/softmax_cross_entropy_loss_impl.cc +++ b/orttraining/orttraining/training_ops/cuda/loss/softmax_cross_entropy_loss_impl.cc @@ -92,7 +92,8 @@ Status 
SoftmaxCrossEntropyLoss::ComputeInternal(OpKernelContext* ctx) co } // calculate logsoftmax - auto status = SoftMaxComputeHelper(logit_data, + auto status = SoftMaxComputeHelper(Stream(), + logit_data, logit_reshape, log_prob_data, CudnnHandle(), @@ -107,8 +108,8 @@ Status SoftmaxCrossEntropyLoss::ComputeInternal(OpKernelContext* ctx) co IAllocatorUniquePtr weight_data_nd = GetScratchBuffer(N_D); T* weight_data_nd_data = weight_data_nd.get(); - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(weight_data_nd_data, 0, N_D * sizeof(T))); - ComputeWeightsSoftmaxCrossEntropyImpl(label_data, weight_data, N_D, C, ignore_index_, weight_data_nd_data); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(weight_data_nd_data, 0, N_D * sizeof(T), Stream())); + ComputeWeightsSoftmaxCrossEntropyImpl(Stream(), label_data, weight_data, N_D, C, ignore_index_, weight_data_nd_data); auto normalize_factor_data = GetScratchBuffer(1); if (reduction_ == ReductionType::MEAN) { @@ -119,6 +120,7 @@ Status SoftmaxCrossEntropyLoss::ComputeInternal(OpKernelContext* ctx) co IAllocatorUniquePtr reduction_buffer = GetScratchBuffer( buffer_size); ORT_RETURN_IF_ERROR(reduce_sum( + Stream(), weight_data_nd_data, normalize_factor_data.get(), static_cast(N_D), @@ -126,10 +128,11 @@ Status SoftmaxCrossEntropyLoss::ComputeInternal(OpKernelContext* ctx) co buffer_size)); } else { const T normalize_factor = static_cast(1); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), cudaMemcpyHostToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), cudaMemcpyHostToDevice, Stream())); } - SoftmaxCrossEntropyLossImpl(log_prob_data, + SoftmaxCrossEntropyLossImpl(Stream(), + log_prob_data, label_data, weight_data_nd_data, normalize_factor_data.get(), @@ -148,7 +151,7 @@ Status SoftmaxCrossEntropyLoss::ComputeInternal(OpKernelContext* ctx) co transpose_output.GetMutable()->Reshape(log_prob->Shape()); log_prob->Reshape(log_prob_shape); ORT_RETURN_IF_ERROR(cuda::Transpose::DoTranspose(cuda::Transpose(info), permutations, *log_prob, *transpose_output.GetMutable())); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(log_prob_data, transposed_data, sizeof(T) * logit_shape.Size(), cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(log_prob_data, transposed_data, sizeof(T) * logit_shape.Size(), cudaMemcpyDeviceToDevice, Stream())); log_prob->Reshape(new_shape); } @@ -209,8 +212,8 @@ Status SoftmaxCrossEntropyLossGrad::ComputeInternal(OpKernelContext* ctx IAllocatorUniquePtr weight_data_nd = GetScratchBuffer(N_D); T* weight_data_nd_data = weight_data_nd.get(); - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(weight_data_nd_data, 0, N_D * sizeof(T))); - ComputeWeightsSoftmaxCrossEntropyImpl(label_data, weight_data, N_D, C, ignore_index_, weight_data_nd_data); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(weight_data_nd_data, 0, N_D * sizeof(T), Stream())); + ComputeWeightsSoftmaxCrossEntropyImpl(Stream(), label_data, weight_data, N_D, C, ignore_index_, weight_data_nd_data); auto normalize_factor_data = GetScratchBuffer(1); if (reduction_ == ReductionType::MEAN) { // Compute buffer size in byte for reduction APIs. 
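Sketch only: the reduce_sum helper above now receives Stream() together with its caller-provided scratch buffer. The snippet below shows what such a helper can look like when built on CUB's DeviceReduce; reduce_sum_on_stream is a hypothetical wrapper, not ONNX Runtime's actual reduction:

    #include <cuda_runtime.h>
    #include <cub/cub.cuh>

    cudaError_t reduce_sum_on_stream(cudaStream_t stream,
                                     const float* d_in, float* d_out, int n,
                                     void* d_scratch, size_t scratch_bytes) {
      // Scratch is allocated by the caller (GetScratchBuffer in the kernels above);
      // passing the stream keeps the reduction ordered with the surrounding kernels.
      return cub::DeviceReduce::Sum(d_scratch, scratch_bytes, d_in, d_out, n, stream);
    }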
@@ -220,6 +223,7 @@ Status SoftmaxCrossEntropyLossGrad::ComputeInternal(OpKernelContext* ctx IAllocatorUniquePtr reduction_buffer = GetScratchBuffer( buffer_size); ORT_RETURN_IF_ERROR(reduce_sum( + Stream(), weight_data_nd_data, normalize_factor_data.get(), static_cast(N_D), @@ -227,10 +231,11 @@ Status SoftmaxCrossEntropyLossGrad::ComputeInternal(OpKernelContext* ctx buffer_size)); } else { const T normalize_factor = static_cast(1); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), cudaMemcpyHostToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), cudaMemcpyHostToDevice, Stream())); } - SoftmaxCrossEntropyLossGradImpl(dY_data, + SoftmaxCrossEntropyLossGradImpl(Stream(), + dY_data, log_prob_data, label_data, weight_data_nd_data, @@ -250,7 +255,7 @@ Status SoftmaxCrossEntropyLossGrad::ComputeInternal(OpKernelContext* ctx d_logit->Reshape(logit_shape); ORT_RETURN_IF_ERROR(cuda::Transpose::DoTranspose(cuda::Transpose(info), permutations, *d_logit, *transpose_output.GetMutable())); auto* transposed_data = (*transpose_output.GetMutable()).template Data(); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(d_logit_data, transposed_data, sizeof(T) * probability_shape.Size(), cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(d_logit_data, transposed_data, sizeof(T) * probability_shape.Size(), cudaMemcpyDeviceToDevice, Stream())); d_logit->Reshape(new_shape); } diff --git a/orttraining/orttraining/training_ops/cuda/loss/softmax_cross_entropy_loss_impl.cu b/orttraining/orttraining/training_ops/cuda/loss/softmax_cross_entropy_loss_impl.cu index aeda6fee4c..02bd7ebf25 100644 --- a/orttraining/orttraining/training_ops/cuda/loss/softmax_cross_entropy_loss_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/loss/softmax_cross_entropy_loss_impl.cu @@ -25,6 +25,7 @@ __global__ void _ComputeWeightsSoftmaxCrossEntropy( template void ComputeWeightsSoftmaxCrossEntropyImpl( + cudaStream_t stream, const Tin* label, const T* weight, size_t count, @@ -35,7 +36,7 @@ void ComputeWeightsSoftmaxCrossEntropyImpl( CUDA_LONG N_D = static_cast(count); CUDA_LONG C = static_cast(label_depth); CUDA_LONG II = static_cast(ignore_index); - _ComputeWeightsSoftmaxCrossEntropy<<>>( + _ComputeWeightsSoftmaxCrossEntropy<<>>( weight_data_nd, label, weight, @@ -65,6 +66,7 @@ __global__ void _WeightedSoftmaxCrossEntropyLoss( template void SoftmaxCrossEntropyLossImpl( + cudaStream_t stream, const T* log_prob, const Tin* label, const T* weight, @@ -77,7 +79,7 @@ void SoftmaxCrossEntropyLossImpl( CUDA_LONG N_D = static_cast(count); CUDA_LONG C = static_cast(label_depth); CUDA_LONG II = static_cast(ignore_index); - _WeightedSoftmaxCrossEntropyLoss<<>>( + _WeightedSoftmaxCrossEntropyLoss<<>>( log_prob, label, weight, @@ -90,6 +92,7 @@ void SoftmaxCrossEntropyLossImpl( #define SPECIALIZED_IMPL_SoftMaxEntropyLossImpl(T, Tin) \ template void SoftmaxCrossEntropyLossImpl( \ + cudaStream_t stream, \ const T* log_prob, \ const Tin* label, \ const T* weight, \ @@ -154,6 +157,7 @@ __global__ void _WeightedReductionNoneSoftmaxCrossEntropyLossGrad( template void SoftmaxCrossEntropyLossGradImpl( + cudaStream_t stream, const T* dY, const T* log_prob, const Tin* label, @@ -167,7 +171,7 @@ void SoftmaxCrossEntropyLossGradImpl( CUDA_LONG C = static_cast(label_depth); int blocksPerGrid = (int)(ceil(static_cast(N_D * C) / GridDim::maxThreadsPerBlock)); if (reduction_none) { - _WeightedReductionNoneSoftmaxCrossEntropyLossGrad<<>>( + 
_WeightedReductionNoneSoftmaxCrossEntropyLossGrad<<>>( dY, log_prob, label, @@ -177,7 +181,7 @@ void SoftmaxCrossEntropyLossGradImpl( N_D, C); } else { - _WeightedSoftmaxCrossEntropyLossGrad<<>>( + _WeightedSoftmaxCrossEntropyLossGrad<<>>( dY, log_prob, label, @@ -191,6 +195,7 @@ void SoftmaxCrossEntropyLossGradImpl( #define SPECIALIZED_IMPL_SoftMaxEntropyLossGradImpl(T, Tin) \ template void SoftmaxCrossEntropyLossGradImpl( \ + cudaStream_t stream, \ const T* dY, \ const T* log_prob, \ const Tin* label, \ @@ -206,6 +211,7 @@ SPECIALIZED_IMPL_SoftMaxEntropyLossGradImpl(float, int64_t) #define SPECIALIZED_IMPL_ComputeWeightsSoftmaxCrossEntropyImpl(T, Tin) \ template void ComputeWeightsSoftmaxCrossEntropyImpl( \ + cudaStream_t stream, \ const Tin* label, \ const T* weight, \ size_t count, \ diff --git a/orttraining/orttraining/training_ops/cuda/loss/softmax_cross_entropy_loss_impl.h b/orttraining/orttraining/training_ops/cuda/loss/softmax_cross_entropy_loss_impl.h index 2333d7d593..d368fe9fbd 100644 --- a/orttraining/orttraining/training_ops/cuda/loss/softmax_cross_entropy_loss_impl.h +++ b/orttraining/orttraining/training_ops/cuda/loss/softmax_cross_entropy_loss_impl.h @@ -12,6 +12,7 @@ namespace cuda { template void SoftmaxCrossEntropyLossImpl( + cudaStream_t stream, const T* log_prob, const Tin* label, const T* weight, @@ -23,6 +24,7 @@ void SoftmaxCrossEntropyLossImpl( template void SoftmaxCrossEntropyLossGradImpl( + cudaStream_t stream, const T* dY, const T* log_prob, const Tin* label, @@ -35,6 +37,7 @@ void SoftmaxCrossEntropyLossGradImpl( template void ComputeWeightsSoftmaxCrossEntropyImpl( + cudaStream_t stream, const Tin* label, const T* weight, size_t count, diff --git a/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.cc b/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.cc index ce45bd8c7b..441a39d21d 100644 --- a/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.cc +++ b/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.cc @@ -49,7 +49,8 @@ Status SoftmaxCrossEntropy::ComputeInternal(OpKernelContext* ctx) const { T* log_prob_data = log_prob->template MutableData(); // calculate logsoftmax - auto status = SoftMaxComputeHelper(logit_data, + auto status = SoftMaxComputeHelper(Stream(), + logit_data, logit_reshape, log_prob_data, CudnnHandle(), @@ -64,6 +65,7 @@ Status SoftmaxCrossEntropy::ComputeInternal(OpKernelContext* ctx) const { // calculate (label * log(softmax)) for each element IAllocatorUniquePtr temp_X = GetScratchBuffer(N * D); SoftMaxCrossEntropyImpl( + Stream(), log_prob_data, // logsoftmax result label_data, // label normalize_factor, // normalize_factor @@ -109,6 +111,7 @@ Status SoftmaxCrossEntropyGrad::ComputeInternal(OpKernelContext* ctx) const { T* d_logits_data = d_logits->template MutableData(); SoftMaxCrossEntropyGradImpl( + Stream(), dY_data, // Dy log_prob_data, // log(pi) label_data, // Label @@ -147,7 +150,8 @@ Status SparseSoftmaxCrossEntropy::ComputeInternal(OpKernelContext* ctx) T* log_prob_data = log_prob->template MutableData(); // calculate logsoftmax - auto status = SoftMaxComputeHelper(logit_data, + auto status = SoftMaxComputeHelper(Stream(), + logit_data, logit_reshape, log_prob_data, CudnnHandle(), @@ -166,11 +170,11 @@ Status SparseSoftmaxCrossEntropy::ComputeInternal(OpKernelContext* ctx) auto normalize_factor_data = GetScratchBuffer(1); if (reduction_ == ReductionType::SUM) { const T normalize_factor = static_cast(1); - 
cudaMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), cudaMemcpyHostToDevice); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), cudaMemcpyHostToDevice, Stream())); } else if (reduction_ == ReductionType::MEAN) { if (weight_data == nullptr) { const T normalize_factor = static_cast(N); - cudaMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), cudaMemcpyHostToDevice); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), cudaMemcpyHostToDevice, Stream())); } else { // Compute buffer size in byte for reduction APIs. const auto buffer_size = @@ -179,6 +183,7 @@ Status SparseSoftmaxCrossEntropy::ComputeInternal(OpKernelContext* ctx) IAllocatorUniquePtr reduction_buffer = GetScratchBuffer( buffer_size); ORT_RETURN_IF_ERROR(reduce_sum( + Stream(), weight_data, normalize_factor_data.get(), static_cast(N), @@ -187,7 +192,8 @@ Status SparseSoftmaxCrossEntropy::ComputeInternal(OpKernelContext* ctx) } } - SparseSoftmaxCrossEntropyImpl(log_prob_data, + SparseSoftmaxCrossEntropyImpl(Stream(), + log_prob_data, label_data, weight_data, normalize_factor_data.get(), @@ -241,11 +247,11 @@ Status SparseSoftmaxCrossEntropyGrad::ComputeInternal(OpKernelContext* c auto normalize_factor_data = GetScratchBuffer(1); if (reduction_ == ReductionType::SUM) { const T normalize_factor = static_cast(1); - cudaMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), cudaMemcpyHostToDevice); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), cudaMemcpyHostToDevice, Stream())); } else if (reduction_ == ReductionType::MEAN) { if (weight_data == nullptr) { const T normalize_factor = static_cast(N); - cudaMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), cudaMemcpyHostToDevice); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), cudaMemcpyHostToDevice, Stream())); } else { // Compute buffer size in byte for reduction APIs. 
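Sketch only: the SparseSoftmaxCrossEntropy hunks above wrap the previously unchecked cudaMemcpyAsync calls in CUDA_RETURN_IF_ERROR and route them through Stream(). The point of sharing the stream is that the scalar upload is ordered before the kernel that reads it; ScaleBy and NormalizeOnStream below are hypothetical names used to illustrate that ordering:

    #include <cuda_runtime.h>

    __global__ void ScaleBy(const float* factor, float* data, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) data[i] /= *factor;
    }

    cudaError_t NormalizeOnStream(cudaStream_t stream, float* d_data, float* d_factor, int n) {
      const float factor = static_cast<float>(n);
      // Host-to-device copy enqueued on the compute stream; the return value is
      // checked, mirroring the CUDA_RETURN_IF_ERROR additions above.
      cudaError_t err = cudaMemcpyAsync(d_factor, &factor, sizeof(float), cudaMemcpyHostToDevice, stream);
      if (err != cudaSuccess) return err;
      ScaleBy<<<(n + 255) / 256, 256, 0, stream>>>(d_factor, d_data, n);
      return cudaGetLastError();
    }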
const auto buffer_size = @@ -254,6 +260,7 @@ Status SparseSoftmaxCrossEntropyGrad::ComputeInternal(OpKernelContext* c IAllocatorUniquePtr reduction_buffer = GetScratchBuffer( buffer_size); ORT_RETURN_IF_ERROR(reduce_sum( + Stream(), weight_data, normalize_factor_data.get(), static_cast(N), @@ -262,7 +269,8 @@ Status SparseSoftmaxCrossEntropyGrad::ComputeInternal(OpKernelContext* c } } - SparseSoftmaxCrossEntropyGradImpl(dY_data, + SparseSoftmaxCrossEntropyGradImpl(Stream(), + dY_data, log_prob_data, label_data, weight_data, diff --git a/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.cu b/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.cu index cbb430418b..a9165fd9ed 100644 --- a/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.cu @@ -22,6 +22,7 @@ __global__ void _SoftMaxCrossEntropy( template void SoftMaxCrossEntropyImpl( + cudaStream_t stream, const T* log_prob, const T* label, size_t normalize_factor, @@ -30,7 +31,7 @@ void SoftMaxCrossEntropyImpl( int blocksPerGrid = (int)(ceil(static_cast(count) / GridDim::maxThreadsPerBlock)); CUDA_LONG N = static_cast(count); CUDA_LONG NORMALIZE_FACTOR = static_cast(normalize_factor); - _SoftMaxCrossEntropy<<>>( + _SoftMaxCrossEntropy<<>>( log_prob, label, NORMALIZE_FACTOR, @@ -40,6 +41,7 @@ void SoftMaxCrossEntropyImpl( #define SPECIALIZED_IMPL_SoftMaxEntropyImpl(T) \ template void SoftMaxCrossEntropyImpl( \ + cudaStream_t stream, \ const T* log_prob, \ const T* label, \ size_t normalize_factor, \ @@ -62,6 +64,7 @@ __global__ void _SoftMaxCrossEntropyGrad( template void SoftMaxCrossEntropyGradImpl( + cudaStream_t stream, const T* dY, const T* log_prob, const T* label, @@ -71,7 +74,7 @@ void SoftMaxCrossEntropyGradImpl( int blocksPerGrid = (int)(ceil(static_cast(count) / GridDim::maxThreadsPerBlock)); CUDA_LONG N = static_cast(count); CUDA_LONG NORMALIZE_FACTOR = static_cast(normalize_factor); - _SoftMaxCrossEntropyGrad<<>>( + _SoftMaxCrossEntropyGrad<<>>( dY, log_prob, label, @@ -82,6 +85,7 @@ void SoftMaxCrossEntropyGradImpl( #define SPECIALIZED_IMPL_SoftMaxEntropyGradImpl(T) \ template void SoftMaxCrossEntropyGradImpl( \ + cudaStream_t stream, \ const T* dY, \ const T* log_prob, \ const T* label, \ @@ -128,6 +132,7 @@ __global__ void _WeightedSparseSoftmaxCrossEntropy( template void SparseSoftmaxCrossEntropyImpl( + cudaStream_t stream, const T* log_prob, const Tin* label, const T* weight, @@ -139,7 +144,7 @@ void SparseSoftmaxCrossEntropyImpl( CUDA_LONG N = static_cast(count); CUDA_LONG D = static_cast(label_depth); if (weight) { - _WeightedSparseSoftmaxCrossEntropy<<>>( + _WeightedSparseSoftmaxCrossEntropy<<>>( log_prob, label, weight, @@ -148,7 +153,7 @@ void SparseSoftmaxCrossEntropyImpl( N, D); } else { - _SparseSoftmaxCrossEntropy<<>>( + _SparseSoftmaxCrossEntropy<<>>( log_prob, label, normalize_factor, @@ -160,6 +165,7 @@ void SparseSoftmaxCrossEntropyImpl( #define SPECIALIZED_IMPL_SparseSoftMaxEntropyImpl(T, Tin) \ template void SparseSoftmaxCrossEntropyImpl( \ + cudaStream_t stream, \ const T* log_prob, \ const Tin* label, \ const T* weight, \ @@ -212,6 +218,7 @@ __global__ void _WeightedSparseSoftmaxCrossEntropyGrad( template void SparseSoftmaxCrossEntropyGradImpl( + cudaStream_t stream, const T* dY, const T* log_prob, const Tin* label, @@ -224,7 +231,7 @@ void SparseSoftmaxCrossEntropyGradImpl( CUDA_LONG D = static_cast(label_depth); int blocksPerGrid = (int)(ceil(static_cast(N * D) / 
GridDim::maxThreadsPerBlock)); if (weight) { - _WeightedSparseSoftmaxCrossEntropyGrad<<>>( + _WeightedSparseSoftmaxCrossEntropyGrad<<>>( dY, log_prob, label, @@ -234,7 +241,7 @@ void SparseSoftmaxCrossEntropyGradImpl( N, D); } else { - _SparseSoftmaxCrossEntropyGrad<<>>( + _SparseSoftmaxCrossEntropyGrad<<>>( dY, log_prob, label, @@ -247,6 +254,7 @@ void SparseSoftmaxCrossEntropyGradImpl( #define SPECIALIZED_IMPL_SparseSoftMaxEntropyGradImpl(T, Tin) \ template void SparseSoftmaxCrossEntropyGradImpl( \ + cudaStream_t stream, \ const T* dY, \ const T* log_prob, \ const Tin* label, \ diff --git a/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.h b/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.h index 6345f738a9..d41718d276 100644 --- a/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.h +++ b/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.h @@ -11,6 +11,7 @@ namespace cuda { template void SoftMaxCrossEntropyImpl( + cudaStream_t stream, const T* log_prob, const T* label, size_t normalize_factor, @@ -19,6 +20,7 @@ void SoftMaxCrossEntropyImpl( template void SoftMaxCrossEntropyGradImpl( + cudaStream_t stream, const T* dY, const T* log_prob, const T* label, @@ -28,6 +30,7 @@ void SoftMaxCrossEntropyGradImpl( template void SparseSoftmaxCrossEntropyImpl( + cudaStream_t stream, const T* log_prob, const Tin* label, const T* weight, @@ -38,6 +41,7 @@ void SparseSoftmaxCrossEntropyImpl( template void SparseSoftmaxCrossEntropyGradImpl( + cudaStream_t stream, const T* dY, const T* log_prob, const Tin* label, diff --git a/orttraining/orttraining/training_ops/cuda/math/div_grad.cc b/orttraining/orttraining/training_ops/cuda/math/div_grad.cc index 1f2d8abb50..75477dcc93 100644 --- a/orttraining/orttraining/training_ops/cuda/math/div_grad.cc +++ b/orttraining/orttraining/training_ops/cuda/math/div_grad.cc @@ -67,6 +67,7 @@ Status DivGrad::ComputeInternal(OpKernelContext* context) const { switch (prepare.output_rank_or_simple_broadcast) { case static_cast(SimpleBroadcast::NoBroadcast): ImplDivGradSimple( + Stream(), SimpleBroadcast::NoBroadcast, prepare_a_data, prepare_b_data, @@ -84,6 +85,7 @@ Status DivGrad::ComputeInternal(OpKernelContext* context) const { } ImplDivGradSimple( + Stream(), SimpleBroadcast::LeftScalar, prepare_a_data, prepare_b_data, @@ -112,6 +114,7 @@ Status DivGrad::ComputeInternal(OpKernelContext* context) const { temp_db_data = temp_db_allocator.get(); } ImplDivGradSimple( + Stream(), SimpleBroadcast::RightScalar, prepare_a_data, prepare_b_data, @@ -143,6 +146,7 @@ Status DivGrad::ComputeInternal(OpKernelContext* context) const { if (prepare.output_rank_or_simple_broadcast == static_cast(SimpleBroadcast::RightPerChannelBatch1)) { // lhs(1,C,H) and rhs (C,1) ImplDivGradRhsPerChannelBatch1( + Stream(), prepare_a_data, prepare_b_data, prepare_dy_data, @@ -153,6 +157,7 @@ Status DivGrad::ComputeInternal(OpKernelContext* context) const { } else { // lhs(N,C,H) and rhs (C,1) ImplDivGradRhsPerChannelBatchN( + Stream(), prepare_a_data, prepare_b_data, prepare_dy_data, @@ -197,6 +202,7 @@ Status DivGrad::ComputeInternal(OpKernelContext* context) const { } ImplDivGrad( + Stream(), prepare.output_rank_or_simple_broadcast, &prepare.lhs_padded_strides, prepare_a_data, diff --git a/orttraining/orttraining/training_ops/cuda/math/div_grad_impl.cu b/orttraining/orttraining/training_ops/cuda/math/div_grad_impl.cu index 527f396093..1e64b1e110 100644 --- 
a/orttraining/orttraining/training_ops/cuda/math/div_grad_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/math/div_grad_impl.cu @@ -260,6 +260,7 @@ __global__ void _DivGrad_B( template void ImplDivGradSimple( + cudaStream_t stream, SimpleBroadcast simpleBroadcast, const T* a_data, const T* b_data, @@ -274,7 +275,7 @@ void ImplDivGradSimple( case SimpleBroadcast::NoBroadcast: // a, b and dy has the same shape: a_is_scalar = false, b_is_scalar = false if (da_output_data && db_output_data) - _DivGradSimple<<>>( + _DivGradSimple<<>>( a_data, b_data, dy_data, @@ -282,13 +283,13 @@ void ImplDivGradSimple( db_output_data, N); else if (da_output_data) - _DivGradSimple_A<<>>( + _DivGradSimple_A<<>>( b_data, dy_data, da_output_data, N); else - _DivGradSimple_B<<>>( + _DivGradSimple_B<<>>( a_data, b_data, dy_data, @@ -298,7 +299,7 @@ void ImplDivGradSimple( case SimpleBroadcast::LeftScalar: // a is a scalar, b and dy has the same shape if (da_output_data && db_output_data) - _DivGradSimple<<>>( + _DivGradSimple<<>>( a_data, b_data, dy_data, @@ -306,13 +307,13 @@ void ImplDivGradSimple( db_output_data, N); else if (da_output_data) - _DivGradSimple_A<<>>( + _DivGradSimple_A<<>>( b_data, dy_data, da_output_data, N); else - _DivGradSimple_B<<>>( + _DivGradSimple_B<<>>( a_data, b_data, dy_data, @@ -322,7 +323,7 @@ void ImplDivGradSimple( case SimpleBroadcast::RightScalar: // b is a scalar, a and dy has the same shape if (da_output_data && db_output_data) - _DivGradSimple<<>>( + _DivGradSimple<<>>( a_data, b_data, dy_data, @@ -330,13 +331,13 @@ void ImplDivGradSimple( db_output_data, N); else if (da_output_data) - _DivGradSimple_A<<>>( + _DivGradSimple_A<<>>( b_data, dy_data, da_output_data, N); else - _DivGradSimple_B<<>>( + _DivGradSimple_B<<>>( a_data, b_data, dy_data, @@ -350,6 +351,7 @@ void ImplDivGradSimple( template void ImplDivGradRhsPerChannelBatch1( + cudaStream_t stream, const T* a_data, const T* b_data, const T* dy_data, @@ -360,7 +362,7 @@ void ImplDivGradRhsPerChannelBatch1( int blocksPerGrid = (int)(ceil(static_cast(count) / GridDim::maxThreadsPerBlock)); CUDA_LONG N = static_cast(count); if (da_output_data && db_output_data) - _DivGradRhsPerChannelBatch1<<>>( + _DivGradRhsPerChannelBatch1<<>>( a_data, b_data, dy_data, @@ -369,14 +371,14 @@ void ImplDivGradRhsPerChannelBatch1( db_output_data, N); else if (da_output_data) - _DivGradRhsPerChannelBatch1_A<<>>( + _DivGradRhsPerChannelBatch1_A<<>>( b_data, dy_data, fdm_H, da_output_data, N); else - _DivGradRhsPerChannelBatch1_B<<>>( + _DivGradRhsPerChannelBatch1_B<<>>( a_data, b_data, dy_data, @@ -387,6 +389,7 @@ void ImplDivGradRhsPerChannelBatch1( template void ImplDivGradRhsPerChannelBatchN( + cudaStream_t stream, const T* a_data, const T* b_data, const T* dy_data, @@ -399,7 +402,7 @@ void ImplDivGradRhsPerChannelBatchN( CUDA_LONG N = static_cast(count); if (da_output_data && db_output_data) - _DivGradRhsPerChannelBatchN<<>>( + _DivGradRhsPerChannelBatchN<<>>( a_data, b_data, dy_data, @@ -409,7 +412,7 @@ void ImplDivGradRhsPerChannelBatchN( db_output_data, N); else if (da_output_data) - _DivGradRhsPerChannelBatchN_A<<>>( + _DivGradRhsPerChannelBatchN_A<<>>( b_data, dy_data, fdm_H, @@ -417,7 +420,7 @@ void ImplDivGradRhsPerChannelBatchN( da_output_data, N); else - _DivGradRhsPerChannelBatchN_B<<>>( + _DivGradRhsPerChannelBatchN_B<<>>( a_data, b_data, dy_data, @@ -429,6 +432,7 @@ void ImplDivGradRhsPerChannelBatchN( template void ImplDivGrad( + cudaStream_t stream, int32_t output_rank, const TArray* a_padded_strides, const T* a_data, @@ 
-443,7 +447,7 @@ void ImplDivGrad( CUDA_LONG N = static_cast(count); if (a_padded_strides && a_padded_strides->Size() && b_padded_strides && b_padded_strides->Size()) { if (da_output_data && db_output_data) - _DivGrad<<>>( + _DivGrad<<>>( output_rank, *a_padded_strides, a_data, @@ -455,7 +459,7 @@ void ImplDivGrad( db_output_data, N); else if (da_output_data) - _DivGrad_A<<>>( + _DivGrad_A<<>>( output_rank, *b_padded_strides, b_data, @@ -464,7 +468,7 @@ void ImplDivGrad( da_output_data, N); else - _DivGrad_B<<>>( + _DivGrad_B<<>>( output_rank, *a_padded_strides, a_data, @@ -476,7 +480,7 @@ void ImplDivGrad( N); } else if (a_padded_strides && a_padded_strides->Size()) { if (da_output_data && db_output_data) - _DivGrad<<>>( + _DivGrad<<>>( output_rank, *a_padded_strides, a_data, @@ -488,7 +492,7 @@ void ImplDivGrad( db_output_data, N); else if (da_output_data) - _DivGrad_A<<>>( + _DivGrad_A<<>>( output_rank, *b_padded_strides, b_data, @@ -497,7 +501,7 @@ void ImplDivGrad( da_output_data, N); else - _DivGrad_B<<>>( + _DivGrad_B<<>>( output_rank, *a_padded_strides, a_data, @@ -509,7 +513,7 @@ void ImplDivGrad( N); } else { if (da_output_data && db_output_data) - _DivGrad<<>>( + _DivGrad<<>>( output_rank, *a_padded_strides, a_data, @@ -521,7 +525,7 @@ void ImplDivGrad( db_output_data, N); else if (da_output_data) - _DivGrad_A<<>>( + _DivGrad_A<<>>( output_rank, *b_padded_strides, b_data, @@ -530,7 +534,7 @@ void ImplDivGrad( da_output_data, N); else - _DivGrad_B<<>>( + _DivGrad_B<<>>( output_rank, *a_padded_strides, a_data, @@ -545,6 +549,7 @@ void ImplDivGrad( #define SPECIALIZED_DIV_GRAD_IMPL(T) \ template void ImplDivGrad( \ + cudaStream_t stream, \ int32_t output_rank, \ const TArray* a_padded_strides, \ const T* a_data, \ @@ -556,6 +561,7 @@ void ImplDivGrad( T* da_output_data, \ T* db_output_data); \ template void ImplDivGradRhsPerChannelBatch1( \ + cudaStream_t stream, \ const T* a_data, \ const T* b_data, \ const T* dy_data, \ @@ -564,6 +570,7 @@ void ImplDivGrad( T* da_output_data, \ T* db_output_data); \ template void ImplDivGradRhsPerChannelBatchN( \ + cudaStream_t stream, \ const T* a_data, \ const T* b_data, \ const T* dy_data, \ @@ -573,6 +580,7 @@ void ImplDivGrad( T* da_output_data, \ T* db_output_data); \ template void ImplDivGradSimple( \ + cudaStream_t stream, \ SimpleBroadcast simpleBroadcast, \ const T* a_data, \ const T* b_data, \ diff --git a/orttraining/orttraining/training_ops/cuda/math/div_grad_impl.h b/orttraining/orttraining/training_ops/cuda/math/div_grad_impl.h index 68a59ca06a..947535277f 100644 --- a/orttraining/orttraining/training_ops/cuda/math/div_grad_impl.h +++ b/orttraining/orttraining/training_ops/cuda/math/div_grad_impl.h @@ -9,6 +9,7 @@ namespace onnxruntime { namespace cuda { template void ImplDivGradSimple( + cudaStream_t stream, SimpleBroadcast simpleBroadcast, const T* a_data, const T* b_data, @@ -19,6 +20,7 @@ void ImplDivGradSimple( template void ImplDivGradRhsPerChannelBatch1( + cudaStream_t stream, const T* a_data, const T* b_data, const T* dy_data, @@ -29,6 +31,7 @@ void ImplDivGradRhsPerChannelBatch1( template void ImplDivGradRhsPerChannelBatchN( + cudaStream_t stream, const T* a_data, const T* b_data, const T* dy_data, @@ -40,6 +43,7 @@ void ImplDivGradRhsPerChannelBatchN( template void ImplDivGrad( + cudaStream_t stream, int32_t output_rank, const TArray* a_padded_strides, const T* a_data, diff --git a/orttraining/orttraining/training_ops/cuda/math/isfinite.cc b/orttraining/orttraining/training_ops/cuda/math/isfinite.cc index 
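Sketch only: the div-grad hunks above show that when a templated launcher gains the cudaStream_t parameter, every explicit-instantiation macro (SPECIALIZED_DIV_GRAD_IMPL and friends) has to be changed in lockstep, or the .cu translation unit stops exporting the symbols the .cc callers link against. A reduced example of that pairing, with hypothetical names (HalveKernel, ImplHalve):

    #include <cuda_runtime.h>

    template <typename T>
    __global__ void HalveKernel(const T* in, T* out, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) out[i] = in[i] / static_cast<T>(2);
    }

    template <typename T>
    void ImplHalve(cudaStream_t stream, const T* in, T* out, int n) {  // stream added here...
      HalveKernel<T><<<(n + 255) / 256, 256, 0, stream>>>(in, out, n);
    }

    // ...and mirrored in the explicit instantiations, exactly as the SPECIALIZED_* macros do.
    #define SPECIALIZED_HALVE_IMPL(T) \
      template void ImplHalve<T>(cudaStream_t stream, const T* in, T* out, int n);

    SPECIALIZED_HALVE_IMPL(float)
    SPECIALIZED_HALVE_IMPL(double)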
d42dd7372f..07ba929cc2 100644 --- a/orttraining/orttraining/training_ops/cuda/math/isfinite.cc +++ b/orttraining/orttraining/training_ops/cuda/math/isfinite.cc @@ -26,6 +26,7 @@ Status IsFiniteOp::ComputeInternal(OpKernelContext* context) const { const Tensor& input = *context->Input(0); Tensor& output = *context->Output(0, input.Shape()); IsFinite( + Stream(), reinterpret_cast(input.Data()), output.MutableData(), input.Shape().Size()); @@ -59,7 +60,7 @@ Status IsAllFiniteOp::ComputeInternal(OpKernelContext* context) const { // to false if any value in any tensor is non-finite. Tensor& output = *context->Output(0, {}); auto* output_data = reinterpret_cast::MappedType*>(output.template MutableData()); - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(output_data, int(true), sizeof(bool))); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(output_data, int(true), sizeof(bool), Stream())); std::vector> grouped_tensor_pointers(total_tensor_count); std::vector tensor_sizes(total_tensor_count); @@ -76,7 +77,7 @@ Status IsAllFiniteOp::ComputeInternal(OpKernelContext* context) const { // Check if all values are finite and write true to output. // Otherwise, false will be written. launch_multi_tensor_functor<1, TFunctor>( - 2048 * 32, tensor_sizes, grouped_tensor_pointers, functor, output_data); + Stream(), 2048 * 32, tensor_sizes, grouped_tensor_pointers, functor, output_data); return Status::OK(); } diff --git a/orttraining/orttraining/training_ops/cuda/math/isfinite.cu b/orttraining/orttraining/training_ops/cuda/math/isfinite.cu index 5b85a7099e..95fd7d1a4e 100644 --- a/orttraining/orttraining/training_ops/cuda/math/isfinite.cu +++ b/orttraining/orttraining/training_ops/cuda/math/isfinite.cu @@ -15,14 +15,14 @@ __global__ void _IsFinite(const TSrc* input, bool* output, CUDA_LONG N) { } template -void IsFinite(const TSrc* input, bool* output, size_t count) { +void IsFinite(cudaStream_t stream, const TSrc* input, bool* output, size_t count) { int blocksPerGrid = (int)(ceil(static_cast(count) / GridDim::maxThreadsPerBlock)); CUDA_LONG N = static_cast(count); - _IsFinite<<>>(input, output, N); + _IsFinite<<>>(input, output, N); } #define SPECIALIZE_ISFINITE_IMPL(T) \ -template void IsFinite(const T* input, bool* output, size_t count); +template void IsFinite(cudaStream_t stream, const T* input, bool* output, size_t count); SPECIALIZE_ISFINITE_IMPL(half) SPECIALIZE_ISFINITE_IMPL(float) @@ -53,14 +53,14 @@ __global__ void IsAllFiniteMultiTensorImpl(ChunkGroup<1> chunks, bool* output) { } template -void IsAllFiniteFunctor::operator()(ChunkGroup<1> chunks, bool* output) { +void IsAllFiniteFunctor::operator()(cudaStream_t stream, ChunkGroup<1> chunks, bool* output) { const int block_count = chunks.chunk_count; const int thread_count = ChunkGroup<1>::thread_count_per_block; - IsAllFiniteMultiTensorImpl<<>>(chunks, output); + IsAllFiniteMultiTensorImpl<<>>(chunks, output); } #define INSTANTIATE_ISALLFINITE_FUNCTOR(T) \ - template void IsAllFiniteFunctor::operator()(ChunkGroup<1> chunks, bool* output); + template void IsAllFiniteFunctor::operator()(cudaStream_t stream, ChunkGroup<1> chunks, bool* output); INSTANTIATE_ISALLFINITE_FUNCTOR(half) INSTANTIATE_ISALLFINITE_FUNCTOR(float) diff --git a/orttraining/orttraining/training_ops/cuda/math/isfinite.h b/orttraining/orttraining/training_ops/cuda/math/isfinite.h index 45aaa070da..44e9a7a50a 100644 --- a/orttraining/orttraining/training_ops/cuda/math/isfinite.h +++ b/orttraining/orttraining/training_ops/cuda/math/isfinite.h @@ -19,7 +19,7 @@ class IsFiniteOp final : public 
CudaKernel { }; template -void IsFinite(const TSrc* input, bool* output, size_t N); +void IsFinite(cudaStream_t stream, const TSrc* input, bool* output, size_t N); template class IsAllFiniteOp final : public CudaKernel { @@ -32,7 +32,7 @@ class IsAllFiniteOp final : public CudaKernel { template struct IsAllFiniteFunctor { - void operator()(ChunkGroup<1> chunks, bool* output); + void operator()(cudaStream_t stream, ChunkGroup<1> chunks, bool* output); }; } // namespace cuda diff --git a/orttraining/orttraining/training_ops/cuda/math/mixed_precision_scale.cc b/orttraining/orttraining/training_ops/cuda/math/mixed_precision_scale.cc index 88f800b96b..1ab0a00307 100644 --- a/orttraining/orttraining/training_ops/cuda/math/mixed_precision_scale.cc +++ b/orttraining/orttraining/training_ops/cuda/math/mixed_precision_scale.cc @@ -102,6 +102,7 @@ Status MixedPrecisionScale::ComputeInternal(OpKernelContext* context) cons #define CASE(TP_TYPE, DstT) \ case TP_TYPE: \ Impl_MixedPrecisionScale::MappedType>( \ + Stream(), \ x_data, \ scale_data, \ reinterpret_cast::MappedType*>(y_data), \ diff --git a/orttraining/orttraining/training_ops/cuda/math/mixed_precision_scale.cu b/orttraining/orttraining/training_ops/cuda/math/mixed_precision_scale.cu index a4c46b12aa..b86641c091 100644 --- a/orttraining/orttraining/training_ops/cuda/math/mixed_precision_scale.cu +++ b/orttraining/orttraining/training_ops/cuda/math/mixed_precision_scale.cu @@ -24,13 +24,14 @@ __global__ void _MixedPrecisionScale( template void Impl_MixedPrecisionScale( + cudaStream_t stream, const SrcT* input_data, const float* scale_data, DstT* output_data, size_t count){ int blocksPerGrid = static_cast(CeilDiv(count, GridDim::maxThreadsPerBlock)); CUDA_LONG N = static_cast(count); - _MixedPrecisionScale<<>>( + _MixedPrecisionScale<<>>( input_data, scale_data, output_data, @@ -39,6 +40,7 @@ void Impl_MixedPrecisionScale( #define SPECIALIZE_MIXEDPRECISIONSCALE_IMPL(SrcT, DstT) \ template void Impl_MixedPrecisionScale( \ + cudaStream_t stream, \ const SrcT* input_data, \ const float* scale_data, \ DstT* output_data, \ diff --git a/orttraining/orttraining/training_ops/cuda/math/mixed_precision_scale.h b/orttraining/orttraining/training_ops/cuda/math/mixed_precision_scale.h index f63f5431ae..b5400cc30e 100644 --- a/orttraining/orttraining/training_ops/cuda/math/mixed_precision_scale.h +++ b/orttraining/orttraining/training_ops/cuda/math/mixed_precision_scale.h @@ -10,6 +10,7 @@ namespace cuda { template void Impl_MixedPrecisionScale( + cudaStream_t stream, const SrcT* input_data, const float* scale_data, DstT* output_data, diff --git a/orttraining/orttraining/training_ops/cuda/math/scale.cc b/orttraining/orttraining/training_ops/cuda/math/scale.cc index a525cd8df2..7fe37d00f7 100644 --- a/orttraining/orttraining/training_ops/cuda/math/scale.cc +++ b/orttraining/orttraining/training_ops/cuda/math/scale.cc @@ -47,6 +47,7 @@ Status Scale::ComputeInternal(OpKernelContext* context) const { auto lhs_tensor = context->Input(0); auto output_tensor = context->Output(0, lhs_tensor->Shape()); Impl_Scale( + Stream(), reinterpret_cast(lhs_tensor->template Data()), scale_value, reinterpret_cast(output_tensor->template MutableData()), diff --git a/orttraining/orttraining/training_ops/cuda/math/scale.cu b/orttraining/orttraining/training_ops/cuda/math/scale.cu index 7d9cce529b..b132665039 100644 --- a/orttraining/orttraining/training_ops/cuda/math/scale.cu +++ b/orttraining/orttraining/training_ops/cuda/math/scale.cu @@ -36,13 +36,14 @@ __global__ void _Scale( 
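Sketch only: in the isfinite hunks above, IsAllFiniteFunctor::operator() takes the stream as its first argument and the generic launch_multi_tensor_functor call forwards Stream(). The snippet below illustrates that functor-plus-launcher shape with hypothetical names (AllPositiveKernel, AllPositiveFunctor, launch_on_stream); it is not the real multi-tensor apply:

    #include <cuda_runtime.h>

    __global__ void AllPositiveKernel(const float* data, int n, bool* result) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      // result is assumed pre-set to true (e.g. via cudaMemsetAsync on the same stream);
      // concurrent writes of 'false' are a benign race.
      if (i < n && !(data[i] > 0.0f)) *result = false;
    }

    struct AllPositiveFunctor {
      void operator()(cudaStream_t stream, const float* data, int n, bool* result) const {
        AllPositiveKernel<<<(n + 255) / 256, 256, 0, stream>>>(data, n, result);
      }
    };

    template <typename Functor, typename... Args>
    void launch_on_stream(cudaStream_t stream, Functor f, Args... args) {
      f(stream, args...);  // mirrors launch_multi_tensor_functor(Stream(), ...) above
    }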
template void Impl_Scale( + cudaStream_t stream, const T* input_data, const float scale_value, T* output_data, size_t count) { int blocksPerGrid = static_cast(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); CUDA_LONG N = static_cast(count); - _Scale<<>>( + _Scale<<>>( input_data, static_cast(scale_value), output_data, @@ -51,6 +52,7 @@ void Impl_Scale( #define SPECIALIZE_SCALE_IMPL(T) \ template void Impl_Scale( \ + cudaStream_t stream, \ const T* input_data, \ const float scale_value, \ T* output_data, \ diff --git a/orttraining/orttraining/training_ops/cuda/math/scale.h b/orttraining/orttraining/training_ops/cuda/math/scale.h index b0ecd26962..020f4efbdb 100644 --- a/orttraining/orttraining/training_ops/cuda/math/scale.h +++ b/orttraining/orttraining/training_ops/cuda/math/scale.h @@ -18,6 +18,7 @@ struct GetScaleValueImpl { template void Impl_Scale( + cudaStream_t stream, const T* input_data, const float scale_value, T* output_data, diff --git a/orttraining/orttraining/training_ops/cuda/math/softmax_grad.cc b/orttraining/orttraining/training_ops/cuda/math/softmax_grad.cc index 270397c4d9..74976d0639 100644 --- a/orttraining/orttraining/training_ops/cuda/math/softmax_grad.cc +++ b/orttraining/orttraining/training_ops/cuda/math/softmax_grad.cc @@ -13,6 +13,7 @@ namespace cuda { template Status SoftMaxGradComputeHelper( + cudaStream_t stream, const T* dY, const TensorShape& input_shape, const T* Y, @@ -33,7 +34,7 @@ Status SoftMaxGradComputeHelper( if (D <= 1024 && D * sizeof(T) <= 4096) { dispatch_softmax_backward, is_log_softmax>( - dX_data, dY_data, Y_data, gsl::narrow_cast(D), gsl::narrow_cast(D), gsl::narrow_cast(N)); + stream, dX_data, dY_data, Y_data, gsl::narrow_cast(D), gsl::narrow_cast(D), gsl::narrow_cast(N)); return Status::OK(); } @@ -65,6 +66,7 @@ Status SoftMaxGradComputeHelper( #define SPECIALIZED_SOFTMAXGRAD_HELPER_IMPL_BFloat16(is_log_softmax) \ template <> \ Status SoftMaxGradComputeHelper( \ + cudaStream_t stream, \ const BFloat16* dY, \ const TensorShape& input_shape, \ const BFloat16* Y, \ @@ -79,7 +81,7 @@ Status SoftMaxGradComputeHelper( auto Y_data = reinterpret_cast(Y); \ auto dX_data = reinterpret_cast(dX); \ dispatch_softmax_backward, is_log_softmax>( \ - dX_data, dY_data, Y_data, gsl::narrow_cast(D), gsl::narrow_cast(D), gsl::narrow_cast(N)); \ + stream, dX_data, dY_data, Y_data, gsl::narrow_cast(D), gsl::narrow_cast(D), gsl::narrow_cast(N)); \ return Status::OK(); \ } @@ -117,9 +119,9 @@ Status SoftmaxGrad::ComputeInternal(OpKernelContext* ctx) const { T* dX_data = dX->template MutableData(); if (log_softmax_) { - return SoftMaxGradComputeHelper(dY_data, input_shape, Y_data, dX_data, CudnnHandle(), axis_); + return SoftMaxGradComputeHelper(Stream(), dY_data, input_shape, Y_data, dX_data, CudnnHandle(), axis_); } else { - return SoftMaxGradComputeHelper(dY_data, input_shape, Y_data, dX_data, CudnnHandle(), axis_); + return SoftMaxGradComputeHelper(Stream(), dY_data, input_shape, Y_data, dX_data, CudnnHandle(), axis_); } } diff --git a/orttraining/orttraining/training_ops/cuda/math/softmax_grad.h b/orttraining/orttraining/training_ops/cuda/math/softmax_grad.h index 31e396b0c0..4e50cf2cf4 100644 --- a/orttraining/orttraining/training_ops/cuda/math/softmax_grad.h +++ b/orttraining/orttraining/training_ops/cuda/math/softmax_grad.h @@ -9,7 +9,7 @@ namespace onnxruntime { namespace cuda { template -void dispatch_softmax_backward(output_t* grad_input, const input_t* grad, const input_t* output, int softmax_elements, int 
softmax_elements_stride, int batch_count); +void dispatch_softmax_backward(cudaStream_t stream, output_t* grad_input, const input_t* grad, const input_t* output, int softmax_elements, int softmax_elements_stride, int batch_count); template class SoftmaxGrad final : public CudaKernel { diff --git a/orttraining/orttraining/training_ops/cuda/math/softmax_grad_impl.cu b/orttraining/orttraining/training_ops/cuda/math/softmax_grad_impl.cu index 3b1bf2e508..f3e2fe4d39 100644 --- a/orttraining/orttraining/training_ops/cuda/math/softmax_grad_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/math/softmax_grad_impl.cu @@ -121,7 +121,7 @@ __global__ void softmax_warp_backward(output_t* gradInput, const input_t* grad, } template -void dispatch_softmax_backward(output_t* grad_input, const input_t* grad, const input_t* output, int softmax_elements, int softmax_elements_stride, int batch_count) { +void dispatch_softmax_backward(cudaStream_t stream, output_t* grad_input, const input_t* grad, const input_t* output, int softmax_elements, int softmax_elements_stride, int batch_count) { if (softmax_elements == 0) { return; } else { @@ -145,47 +145,47 @@ void dispatch_softmax_backward(output_t* grad_input, const input_t* grad, const switch (log2_elements) { case 0: // 1 softmax_warp_backward - <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); + <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; case 1: // 2 softmax_warp_backward - <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); + <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; case 2: // 4 softmax_warp_backward - <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); + <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; case 3: // 8 softmax_warp_backward - <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); + <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; case 4: // 16 softmax_warp_backward - <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); + <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; case 5: // 32 softmax_warp_backward - <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); + <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; case 6: // 64 softmax_warp_backward - <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); + <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; case 7: // 128 softmax_warp_backward - <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); + <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; case 8: // 256 softmax_warp_backward - <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); + <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; case 9: // 512 softmax_warp_backward - <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); + <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; case 10: // 1024 softmax_warp_backward - <<>>(grad_input, grad, 
output, batch_count, softmax_elements_stride, softmax_elements); + <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; default: break; @@ -194,8 +194,8 @@ void dispatch_softmax_backward(output_t* grad_input, const input_t* grad, const } #define SPECIALIZED_SOFTMAX_GRAD_IMPL(input_t, output_t, acc_t) \ -template void dispatch_softmax_backward(input_t * grad_input, const output_t* grad, const output_t* output, int softmax_elements, int softmax_elements_stride, int batch_count); \ -template void dispatch_softmax_backward(input_t * grad_input, const output_t* grad, const output_t* output, int softmax_elements, int softmax_elements_stride, int batch_count); +template void dispatch_softmax_backward(cudaStream_t stream, input_t * grad_input, const output_t* grad, const output_t* output, int softmax_elements, int softmax_elements_stride, int batch_count); \ +template void dispatch_softmax_backward(cudaStream_t stream, input_t * grad_input, const output_t* grad, const output_t* output, int softmax_elements, int softmax_elements_stride, int batch_count); SPECIALIZED_SOFTMAX_GRAD_IMPL(float, float, float) SPECIALIZED_SOFTMAX_GRAD_IMPL(half, half, float) diff --git a/orttraining/orttraining/training_ops/cuda/nn/dropout.cc b/orttraining/orttraining/training_ops/cuda/nn/dropout.cc index 688223c336..b67eb70426 100644 --- a/orttraining/orttraining/training_ops/cuda/nn/dropout.cc +++ b/orttraining/orttraining/training_ops/cuda/nn/dropout.cc @@ -37,7 +37,8 @@ REGISTER_GRADIENT_KERNEL(DropoutGrad) template struct DropoutGradComputeImpl { - void operator()(const int64_t N, + void operator()(cudaStream_t stream, + const int64_t N, const Tensor& dY, const bool* mask_data, const float ratio_data, @@ -46,7 +47,7 @@ struct DropoutGradComputeImpl { const CudaT* dY_data = reinterpret_cast(dY.template Data()); CudaT* dX_data = reinterpret_cast(dX.template MutableData()); - DropoutGradientKernelImpl(N, dY_data, mask_data, ratio_data, dX_data); + DropoutGradientKernelImpl(stream, N, dY_data, mask_data, ratio_data, dX_data); } }; @@ -79,7 +80,7 @@ Status DropoutGrad::ComputeInternal(OpKernelContext* context) const { auto dX = context->Output(0, shape); utils::MLTypeCallDispatcher t_disp(dY->GetElementType()); - t_disp.Invoke(N, *dY, mask_data, ratio_data, *dX); + t_disp.Invoke(Stream(), N, *dY, mask_data, ratio_data, *dX); return Status::OK(); } @@ -100,6 +101,7 @@ ONNX_OPERATOR_KERNEL_EX( template struct BiasDropoutComputeImpl { Status operator()(const cudaDeviceProp& prop, + cudaStream_t stream, const int64_t N, const fast_divmod fdm_dim, const float ratio_data, @@ -124,7 +126,7 @@ struct BiasDropoutComputeImpl { CudaT* Y_data = reinterpret_cast(Y.template MutableData()); - BiasDropoutKernelImpl(prop, N, fdm_dim, ratio_data, generator, X_data, bias_data, residual_data, Y_data, mask_data); + BiasDropoutKernelImpl(prop, stream, N, fdm_dim, ratio_data, generator, X_data, bias_data, residual_data, Y_data, mask_data); return Status::OK(); } @@ -185,7 +187,7 @@ Status BiasDropout::ComputeInternal(OpKernelContext* context) const { PhiloxGenerator& generator = generator_ ? 
*generator_ : PhiloxGenerator::Default(); utils::MLTypeCallDispatcherRet t_disp(X->GetElementType()); - return t_disp.Invoke(GetDeviceProp(), N, fdm_dim, ratio_data, generator, *X, *bias, residual, *Y, mask_data); + return t_disp.Invoke(GetDeviceProp(), Stream(), N, fdm_dim, ratio_data, generator, *X, *bias, residual, *Y, mask_data); } } // namespace cuda diff --git a/orttraining/orttraining/training_ops/cuda/nn/dropout_impl.cu b/orttraining/orttraining/training_ops/cuda/nn/dropout_impl.cu index b7960c5151..eed291e1d7 100644 --- a/orttraining/orttraining/training_ops/cuda/nn/dropout_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/nn/dropout_impl.cu @@ -43,6 +43,7 @@ __global__ void DropoutGradientKernel( template void DropoutGradientKernelImpl( + cudaStream_t stream, const int64_t N, const T* dY_data, const bool* mask_data, @@ -50,18 +51,19 @@ void DropoutGradientKernelImpl( T* dX_data) { if (ratio == 0.0f) { if (dY_data != dX_data) { - CUDA_CALL_THROW(cudaMemcpyAsync(dX_data, dY_data, N * sizeof(T), cudaMemcpyDeviceToDevice)); + CUDA_CALL_THROW(cudaMemcpyAsync(dX_data, dY_data, N * sizeof(T), cudaMemcpyDeviceToDevice, stream)); } } else { const float scale = 1.f / (1.f - ratio); const int blocksPerGrid = static_cast(CeilDiv(N, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); DropoutGradientKernel - <<>>(N, dY_data, mask_data, scale, dX_data); + <<>>(N, dY_data, mask_data, scale, dX_data); } } #define SPECIALIZED_DROPOUT_GRAD_IMPL(T) \ template void DropoutGradientKernelImpl( \ + cudaStream_t stream, \ const int64_t N, \ const T* dY_data, \ const bool* mask_data, \ @@ -131,6 +133,7 @@ __global__ void BiasDropoutKernel( template void BiasDropoutKernelImpl( const cudaDeviceProp& prop, + cudaStream_t stream, const int64_t N, const fast_divmod fdm_dim, const float ratio, @@ -149,15 +152,16 @@ void BiasDropoutKernelImpl( auto seeds = generator.NextPhiloxSeeds(counter_offset); if (residual_data == nullptr) { - BiasDropoutKernel<<>>(N, fdm_dim, ratio, seeds, X_data, bias_data, residual_data, Y_data, mask_data); + BiasDropoutKernel<<>>(N, fdm_dim, ratio, seeds, X_data, bias_data, residual_data, Y_data, mask_data); } else { - BiasDropoutKernel<<>>(N, fdm_dim, ratio, seeds, X_data, bias_data, residual_data, Y_data, mask_data); + BiasDropoutKernel<<>>(N, fdm_dim, ratio, seeds, X_data, bias_data, residual_data, Y_data, mask_data); } } #define SPECIALIZED_BIAS_DROPOUT_IMPL(T) \ template void BiasDropoutKernelImpl( \ const cudaDeviceProp& prop, \ + cudaStream_t stream, \ const int64_t N, \ const fast_divmod fdm_dim, \ const float ratio, \ diff --git a/orttraining/orttraining/training_ops/cuda/nn/dropout_impl.h b/orttraining/orttraining/training_ops/cuda/nn/dropout_impl.h index 09444662af..8dbf3f9655 100644 --- a/orttraining/orttraining/training_ops/cuda/nn/dropout_impl.h +++ b/orttraining/orttraining/training_ops/cuda/nn/dropout_impl.h @@ -10,6 +10,7 @@ namespace cuda { template void DropoutGradientKernelImpl( + cudaStream_t stream, const int64_t N, const T* dY_data, const bool* mask_data, @@ -19,6 +20,7 @@ void DropoutGradientKernelImpl( template void BiasDropoutKernelImpl( const cudaDeviceProp& prop, + cudaStream_t stream, const int64_t N, const fast_divmod fdm_dim, const float ratio, diff --git a/orttraining/orttraining/training_ops/cuda/nn/layer_norm.cc b/orttraining/orttraining/training_ops/cuda/nn/layer_norm.cc index 3695c029c8..64262a1ed1 100644 --- a/orttraining/orttraining/training_ops/cuda/nn/layer_norm.cc +++ b/orttraining/orttraining/training_ops/cuda/nn/layer_norm.cc 
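Sketch only: the DropoutGradientKernelImpl change above has a fast path that copies the gradient through unchanged when the ratio is 0. Both that copy and the masked-scale kernel must run on the same stream, otherwise the copy could overtake kernels that still produce dY. A standalone version with hypothetical names (MaskedScaleKernel, DropoutGradOnStream):

    #include <cuda_runtime.h>

    __global__ void MaskedScaleKernel(int64_t n, const float* dy, const bool* mask, float scale, float* dx) {
      int64_t i = blockIdx.x * (int64_t)blockDim.x + threadIdx.x;
      if (i < n) dx[i] = mask[i] ? dy[i] * scale : 0.0f;
    }

    void DropoutGradOnStream(cudaStream_t stream, int64_t n,
                             const float* dy, const bool* mask, float ratio, float* dx) {
      if (ratio == 0.0f) {
        // Pass-through: copy on the compute stream, not the default stream.
        cudaMemcpyAsync(dx, dy, n * sizeof(float), cudaMemcpyDeviceToDevice, stream);
      } else {
        const float scale = 1.0f / (1.0f - ratio);
        const int blocks = static_cast<int>((n + 255) / 256);
        MaskedScaleKernel<<<blocks, 256, 0, stream>>>(n, dy, mask, scale, dx);
      }
    }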
@@ -95,7 +95,7 @@ Status LayerNormGrad::ComputeInternal(OpKernelContext* p_op_ke auto part_grad_gamma = GetScratchBuffer(part_size * n2); auto part_grad_beta = GetScratchBuffer(part_size * n2); - HostLayerNormGradient(GetDeviceProp(), Y_grad_data, X_data, reinterpret_cast(NULL), + HostLayerNormGradient(GetDeviceProp(), Stream(), Y_grad_data, X_data, reinterpret_cast(NULL), scale_data, reinterpret_cast(NULL), mean_data, inv_std_var_data, n1, n2, X_grad_data, scale_grad_data, bias_grad_data, part_grad_gamma.get(), part_grad_beta.get(), part_size); @@ -144,7 +144,7 @@ Status InvertibleLayerNormGrad::ComputeInternal(OpKernelContext* p_op_kern auto part_grad_gamma = GetScratchBuffer(part_size * n2); auto part_grad_beta = GetScratchBuffer(part_size * n2); - HostLayerNormGradient(GetDeviceProp(), Y_grad_data, reinterpret_cast(NULL), Y_data, + HostLayerNormGradient(GetDeviceProp(), Stream(), Y_grad_data, reinterpret_cast(NULL), Y_data, scale_data, bias_data, reinterpret_cast(NULL), inv_std_var_data, n1, n2, X_grad_data, scale_grad_data, bias_grad_data, part_grad_gamma.get(), part_grad_beta.get(), part_size); diff --git a/orttraining/orttraining/training_ops/cuda/nn/layer_norm_impl.cu b/orttraining/orttraining/training_ops/cuda/nn/layer_norm_impl.cu index 00bdc2d525..99f818ff56 100644 --- a/orttraining/orttraining/training_ops/cuda/nn/layer_norm_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/nn/layer_norm_impl.cu @@ -439,6 +439,7 @@ __global__ void cuComputeGradInput( template void HostLayerNormGradient( const cudaDeviceProp& prop, + cudaStream_t stream, const T* dout, const T* input, const T* output, @@ -464,7 +465,7 @@ void HostLayerNormGradient( const int nshared2 = nshared2_a > nshared2_b ? nshared2_a : nshared2_b; if (mean == nullptr && !simplified) { // use_mean == false, simplified == false -> Inverted Layer Norm - cuComputePartGradGammaBeta<<>>( + cuComputePartGradGammaBeta<<>>( dout, input, output, @@ -478,7 +479,7 @@ void HostLayerNormGradient( } else { // use_mean == true, simplified == false -> Layer Norm // use_mean == true, simplified == true -> Simplified Layer Norm - cuComputePartGradGammaBeta<<>>( + cuComputePartGradGammaBeta<<>>( dout, input, output, @@ -493,7 +494,7 @@ void HostLayerNormGradient( const dim3 threads3(warp_size, 8, 1); const dim3 blocks3((n2 + threads2.x - 1) / threads2.x, 1, 1); const int nshared3 = threads3.x * threads3.y * sizeof(U); - cuComputeGradGammaBeta<<>>( + cuComputeGradGammaBeta<<>>( part_grad_gamma, part_grad_beta, part_size, @@ -507,7 +508,7 @@ void HostLayerNormGradient( int nshared = threads1.y > 1 ? 
threads1.y * threads1.x * sizeof(U) : 0; if (mean == nullptr && !simplified) { - cuComputeGradInput<<>>( + cuComputeGradInput<<>>( dout, input, output, @@ -518,7 +519,7 @@ void HostLayerNormGradient( n1, n2, grad_input); } else { - cuComputeGradInput<<>>( + cuComputeGradInput<<>>( dout, input, output, @@ -532,7 +533,7 @@ void HostLayerNormGradient( } #define LAYERNORMGRAD_IMPL(T, U, simplified) \ - template void HostLayerNormGradient(const cudaDeviceProp& prop, const T* dout, const T* input, const T* output, \ + template void HostLayerNormGradient(const cudaDeviceProp& prop, cudaStream_t stream, const T* dout, const T* input, const T* output, \ const T* gamma, const T* beta, const U* mean, const U* invvar, int64_t n1, int64_t n2, \ T* grad_input, T* grad_gamma, T* grad_beta, U* part_grad_gamma, U* part_grad_beta, const int part_size); diff --git a/orttraining/orttraining/training_ops/cuda/nn/layer_norm_impl.h b/orttraining/orttraining/training_ops/cuda/nn/layer_norm_impl.h index 4722ab6126..a8d5e4e9d6 100644 --- a/orttraining/orttraining/training_ops/cuda/nn/layer_norm_impl.h +++ b/orttraining/orttraining/training_ops/cuda/nn/layer_norm_impl.h @@ -31,6 +31,7 @@ namespace cuda { template void HostLayerNormGradient( const cudaDeviceProp& prop, + cudaStream_t stream, const T* dout, const T* input, const T* output, diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/adam.cc b/orttraining/orttraining/training_ops/cuda/optimizer/adam.cc index 3045a549b3..03fdcc3685 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/adam.cc +++ b/orttraining/orttraining/training_ops/cuda/optimizer/adam.cc @@ -115,20 +115,20 @@ Status AdamOptimizer: if (do_update_tensor != nullptr) { const bool do_update = *(do_update_tensor->template Data()); if (!do_update) { - ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(M1, NM1)); - ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(M2, NM2)); + ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(Stream(), M1, NM1)); + ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(Stream(), M2, NM2)); if (S_in != S_out) { *(S_out) = *(S_in); } if (NW != nullptr) { - ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(W, *NW)); + ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(Stream(), W, *NW)); } if (NG != nullptr) { - ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(G, *NG)); + ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(Stream(), G, *NG)); } if (W_MIXED_FP != nullptr && NW_MIXED_FP != nullptr) { - ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(*W_MIXED_FP, *NW_MIXED_FP)); + ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(Stream(), *W_MIXED_FP, *NW_MIXED_FP)); } return Status::OK(); @@ -136,6 +136,7 @@ Status AdamOptimizer: } AdamOptimizerImpl( + Stream(), reinterpret_cast(ETA.template Data()), *S_in, reinterpret_cast(W.template Data()), diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/adam.cu b/orttraining/orttraining/training_ops/cuda/optimizer/adam.cu index d892cd446d..b054485c46 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/adam.cu +++ b/orttraining/orttraining/training_ops/cuda/optimizer/adam.cu @@ -139,6 +139,7 @@ __global__ void _AdamOptimizer_mode1( template void AdamOptimizerImpl( + cudaStream_t stream, const T1* eta, const T2 update_count, const T3* weights, @@ -176,7 +177,7 @@ void AdamOptimizerImpl( // bias correction is applied on learning rate, // weight decay is applied after weight is updated. 
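Aside (illustrative, not part of this change): the bias correction the comment above refers to is the standard Adam correction folded into the step size. With update count t and moment decay rates beta1, beta2, the effective learning rate is

    alpha_t = alpha * sqrt(1 - beta2^t) / (1 - beta1^t)

so only the step size is rescaled while the stored first/second moments stay uncorrected; the alpha/beta correction coefficients passed into these kernels are assumed to follow this convention.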
if (weight_decay_mode == 0) { - _AdamOptimizer_mode0<<>>( + _AdamOptimizer_mode0<<>>( eta, weights, grads, @@ -200,7 +201,7 @@ void AdamOptimizerImpl( N); } else if (weight_decay_mode == 1) { - _AdamOptimizer_mode1<<>>( + _AdamOptimizer_mode1<<>>( eta, weights, grads, @@ -230,6 +231,7 @@ void AdamOptimizerImpl( #define SPECIALIZED_AdamOptimizerImpl(T1, T2, T3, T4, T_GRAD, T_GRAD_NORM, T_MIXED_PRECISION_FP) \ template void AdamOptimizerImpl( \ + cudaStream_t stream, \ const T1* eta, \ const T2 update_count, \ const T3* weights, \ diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/adam.h b/orttraining/orttraining/training_ops/cuda/optimizer/adam.h index f979056e38..4ebb6e41c8 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/adam.h +++ b/orttraining/orttraining/training_ops/cuda/optimizer/adam.h @@ -10,6 +10,7 @@ namespace cuda { template void AdamOptimizerImpl( + cudaStream_t stream, const T1* eta, const T2 update_count, const T3* weights, diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/common.h b/orttraining/orttraining/training_ops/cuda/optimizer/common.h index 72ebf81f91..9cd86b42b3 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/common.h +++ b/orttraining/orttraining/training_ops/cuda/optimizer/common.h @@ -9,11 +9,11 @@ namespace onnxruntime { namespace cuda { template -Status CopyIfNotSameBuffer(const Tensor& source_tensor, Tensor& target_tensor) { +Status CopyIfNotSameBuffer(cudaStream_t stream, const Tensor& source_tensor, Tensor& target_tensor) { const T* source = source_tensor.template Data(); T* target = target_tensor.template MutableData(); if (target != source) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(target, source, source_tensor.SizeInBytes(), cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(target, source, source_tensor.SizeInBytes(), cudaMemcpyDeviceToDevice, stream)); } return Status::OK(); } diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/gradient_control.cc b/orttraining/orttraining/training_ops/cuda/optimizer/gradient_control.cc index 80d9b442fd..a8c4de6f5f 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/gradient_control.cc +++ b/orttraining/orttraining/training_ops/cuda/optimizer/gradient_control.cc @@ -42,7 +42,7 @@ Status ZeroGradient::ComputeInternal(OpKernelContext* ctx) const { CUDA_RETURN_IF_ERROR(cudaMemsetAsync( zero_gradient.template MutableData(), 0, - zero_gradient.Shape().Size() * sizeof(T))); + zero_gradient.Shape().Size() * sizeof(T), Stream())); return Status::OK(); } @@ -75,12 +75,13 @@ Status InPlaceAccumulator::ComputeInternal(OpKernelContext* ctx) cons if (do_update_tensor) { const bool do_update = *(do_update_tensor->template Data()); if (!do_update) { - ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(left_addee_buffer, accumulation_output)); + ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(Stream(), left_addee_buffer, accumulation_output)); return Status::OK(); } } InPlaceAccumulatorImpl( + Stream(), reinterpret_cast(left_addee_buffer.template Data()), reinterpret_cast(right_addee_buffer.template Data()), reinterpret_cast(accumulation_output.template MutableData()), diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/gradient_control.cu b/orttraining/orttraining/training_ops/cuda/optimizer/gradient_control.cu index b6c49a5acb..1d83bb166c 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/gradient_control.cu +++ b/orttraining/orttraining/training_ops/cuda/optimizer/gradient_control.cu @@ -21,13 +21,14 @@ __global__ void 
_InPlaceAccumulator( template void InPlaceAccumulatorImpl( + cudaStream_t stream, const T* gradient_buffer, const T_GRAD* gradient, T* accumulated_gradient, size_t count) { int blocksPerGrid = (int)(ceil(static_cast(count) / GridDim::maxThreadsPerBlock)); CUDA_LONG N = static_cast(count); - _InPlaceAccumulator<<>>( + _InPlaceAccumulator<<>>( gradient_buffer, gradient, accumulated_gradient, @@ -36,6 +37,7 @@ void InPlaceAccumulatorImpl( #define SPECIALIZED_IMPL_InPlaceAccumulator(T, T_GRAD) \ template void InPlaceAccumulatorImpl( \ + cudaStream_t stream, \ const T* gradient_buffer, \ const T_GRAD* gradient, \ T* accumulated_gradient, \ diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/gradient_control.h b/orttraining/orttraining/training_ops/cuda/optimizer/gradient_control.h index 7f54d8bbce..c2a4f8e234 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/gradient_control.h +++ b/orttraining/orttraining/training_ops/cuda/optimizer/gradient_control.h @@ -25,6 +25,7 @@ class InPlaceAccumulator final : public CudaKernel { // Implementation can be found in cuda file, optimizers_impl.cu template void InPlaceAccumulatorImpl( + cudaStream_t stream, const T* gradient_buffer, const T_GRAD* gradient, T* accumulated_gradient, diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc index e27903e89d..ded60d2373 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc +++ b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc @@ -117,6 +117,7 @@ void check_inputs_and_outputs( template Status copy_inputs_to_outputs( + cudaStream_t stream, OpKernelContext* ctx, const int non_grouped_input_count, const int non_grouped_output_count, @@ -155,16 +156,16 @@ Status copy_inputs_to_outputs( w_mixed_precision_new->SetByteOffset(w_mixed_precision->ByteOffset()); if (w_new) { - ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(w, *w_new)); + ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(stream, w, *w_new)); } if (g_new) { - ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(g, *g_new)); + ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(stream, g, *g_new)); } - ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(m1, m1_new)); - ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(m2, m2_new)); + ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(stream, m1, m1_new)); + ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(stream, m2, m2_new)); if (w_mixed_precision_new) { - ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(*w_mixed_precision, *w_mixed_precision_new)); + ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(stream, *w_mixed_precision, *w_mixed_precision_new)); } } @@ -173,6 +174,7 @@ Status copy_inputs_to_outputs( template Status launch_lamb_compute_direction( + cudaStream_t stream, const int64_t update_count, const int group_count, const CudaT2* p_loss_scale, @@ -221,6 +223,7 @@ Status launch_lamb_compute_direction( do_bias_correction ? onnxruntime::contrib::compute_bias_correction_coefficient(betas[i], update_count) : 1.f; LambComputeDirection( + stream, p_ws[i], p_gs[i], p_m1s[i], @@ -268,6 +271,7 @@ Status launch_lamb_compute_direction( LambStage1 lamb_stage1; launch_multi_tensor_functor( + stream, 2048 * 32, tensor_sizes_in_buckets[key], buckets[key], @@ -299,6 +303,7 @@ Status launch_lamb_reduction( constexpr int tensor_count_per_group = 4; + cudaStream_t stream = kernel.Stream(); // Bucketize tensor groups by the associated optimizer configuration. // If two tensor groups use different "alpha", they should be put into two distinct buckets. 
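Aside (illustrative sketch, not part of this change): a minimal version of the bucketing idea described in the comment above, with hypothetical names — the real code keys buckets on the full per-group optimizer configuration, not a single scalar.

#include <map>
#include <vector>

// Tensor groups sharing the same hyper-parameter value land in one bucket so a
// single multi-tensor kernel launch can process the whole bucket.
// "alphas" and the lone float key are assumptions for illustration only.
std::map<float, std::vector<std::vector<void*>>> BucketizeByAlpha(
    const std::vector<float>& alphas,
    const std::vector<std::vector<void*>>& tensor_groups) {
  std::map<float, std::vector<std::vector<void*>>> buckets;
  for (size_t i = 0; i < tensor_groups.size(); ++i) {
    buckets[alphas[i]].push_back(tensor_groups[i]);
  }
  return buckets;
}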
std::vector> buckets; @@ -307,12 +312,14 @@ Status launch_lamb_reduction( for (int i = 0; i < group_count; ++i) { if (tensor_sizes[i] > max_tensor_size) { ORT_RETURN_IF_ERROR(reduce_square_sum( + stream, p_ws[i], p_w_norms[i], tensor_sizes[i], reduction_buffer, reduction_buffer_size)); ORT_RETURN_IF_ERROR(reduce_square_sum( + stream, p_ds[i], p_d_norms[i], tensor_sizes[i], @@ -343,6 +350,7 @@ Status launch_lamb_reduction( typedef LambMultiTensorReductionFunctor TReducer; TReducer reducer; launch_multi_tensor_functor( + stream, 2048 * 32, tensor_sizes_in_buckets, buckets, @@ -357,6 +365,7 @@ Status launch_lamb_reduction( template Status launch_lamb_update( + cudaStream_t stream, const int group_count, const CudaT1* eta, const float ratio_min, @@ -389,6 +398,7 @@ Status launch_lamb_update( for (int i = 0; i < group_count; ++i) { if (tensor_sizes[i] > max_tensor_size) { LambUpdate( + stream, eta, ratio_min, ratio_max, @@ -430,6 +440,7 @@ Status launch_lamb_update( LambStage2 lamb_stage2; launch_multi_tensor_functor( + stream, 2048 * 32, tensor_sizes_in_bucket, buckets, @@ -503,6 +514,7 @@ Status LambOptimizer::Compute auto update_signal = *update_signal_tensor->template Data(); if (!update_signal) { return copy_inputs_to_outputs( + Stream(), ctx, non_grouped_input_count, non_grouped_output_count, @@ -539,14 +551,14 @@ Status LambOptimizer::Compute // and T2=float. IAllocatorUniquePtr d_norm_buffer = GetScratchBuffer(group_count); CudaT2* d_norm_data = reinterpret_cast(d_norm_buffer.get()); - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(d_norm_data, 0, group_count * sizeof(T2))); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(d_norm_data, 0, group_count * sizeof(T2), Stream())); // Allocate buffer for reduction computation of weight tensor. // The i-th weight's norm is stored at the i-th element. // We reduce type T2 tensor to type T2 scalar. An example is that T2=float. IAllocatorUniquePtr w_norm_buffer = GetScratchBuffer(group_count); CudaT2* w_norm_data = reinterpret_cast(w_norm_buffer.get()); - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(w_norm_data, 0, group_count * sizeof(T2))); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(w_norm_data, 0, group_count * sizeof(T2), Stream())); // Find the max size of updated weight tensors. int max_tensor_size = 0; @@ -652,6 +664,7 @@ Status LambOptimizer::Compute } ORT_RETURN_IF_ERROR(launch_lamb_compute_direction( + Stream(), step_data ? 
*step_data : 0, group_count, loss_scale_data, @@ -675,6 +688,7 @@ Status LambOptimizer::Compute reduction_buffer_size)); ORT_RETURN_IF_ERROR(launch_lamb_update( + Stream(), group_count, eta_data, ratio_min_, diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cu b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cu index b8c8171509..5ebc2fff49 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cu +++ b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cu @@ -110,6 +110,7 @@ __global__ void _LambComputeDirectionImpl( template void LambComputeDirection( + cudaStream_t stream, const T1* weights, const T2* grads, const T3* moment_1, @@ -130,7 +131,7 @@ void LambComputeDirection( int blocksPerGrid = (int)(ceil(static_cast(count) / GridDim::maxThreadsPerBlock)); CUDA_LONG N = static_cast(count); - _LambComputeDirectionImpl<<>>( + _LambComputeDirectionImpl<<>>( weights, grads, moment_1, @@ -152,6 +153,7 @@ void LambComputeDirection( #define SPECIALIZED_LAMB_COMPUTE_DIRECTION(T1, T2, T3, T_GRAD_NORM) \ template void LambComputeDirection( \ + cudaStream_t stream, \ const T1* weights, \ const T2* grads, \ const T3* moment_1, \ @@ -256,6 +258,7 @@ __global__ void _LambUpdateImpl( template void LambUpdate( + cudaStream_t stream, const T1* eta, const float ratio_min, const float ratio_max, @@ -270,7 +273,7 @@ void LambUpdate( int blocksPerGrid = (int)(ceil(static_cast(count) / GridDim::maxThreadsPerBlock)); CUDA_LONG N = static_cast(count); - _LambUpdateImpl<<>>( + _LambUpdateImpl<<>>( eta, ratio_min, ratio_max, @@ -286,6 +289,7 @@ void LambUpdate( #define INSTANTIATE_LAMB_UPDATE(T1, T2, T3, T_MIXED_PRECISION_FP) \ template void LambUpdate( \ + cudaStream_t stream, \ const T1* eta, \ const float ratio_min, \ const float ratio_max, \ @@ -356,6 +360,7 @@ __global__ void LambMultiTensorComputeDirectionImpl( template void LambMultiTensorComputeDirectionFunctor::operator()( + cudaStream_t stream, ChunkGroup<6> chunk_group, const T1* loss_scale, const T_GRAD_NORM* g_norm, @@ -369,7 +374,7 @@ void LambMultiTensorComputeDirectionFunctor::operator() const int thread_count = ChunkGroup<6>::thread_count_per_block; const int block_count = chunk_group.chunk_count; - LambMultiTensorComputeDirectionImpl<<>>( + LambMultiTensorComputeDirectionImpl<<>>( chunk_group, loss_scale, g_norm, @@ -384,6 +389,7 @@ void LambMultiTensorComputeDirectionFunctor::operator() #define INSTANTIATE_LAMB_STAGE1_MULTI_TENSOR_FUNCTOR(T1, T2, T3, T_GRAD_NORM) \ template void LambMultiTensorComputeDirectionFunctor::operator()( \ + cudaStream_t stream, \ ChunkGroup<6> chunk_group, \ const T1* loss_scale, \ const T_GRAD_NORM* g_norm, \ @@ -445,6 +451,7 @@ __global__ void LambMultiTensorUpdateImpl( template void LambMultiTensorUpdateFunctor::operator()( + cudaStream_t stream, ChunkGroup<7> chunk_group, const T1* eta, const float ratio_min, @@ -452,7 +459,7 @@ void LambMultiTensorUpdateFunctor::operator()( const int thread_count = ChunkGroup<7>::thread_count_per_block; const int block_count = chunk_group.chunk_count; - LambMultiTensorUpdateImpl<<>>( + LambMultiTensorUpdateImpl<<>>( chunk_group, eta, ratio_min, @@ -461,6 +468,7 @@ void LambMultiTensorUpdateFunctor::operator()( #define INSTANTIATE_LAMB_MULTI_TENSOR_UPDATE_FUNCTOR(T1, T2, T3, T_MIXED_PRECISION_FP) \ template void LambMultiTensorUpdateFunctor::operator()( \ + cudaStream_t stream, \ ChunkGroup<7> chunk_group, \ const T1* eta, \ const float ratio_min, \ @@ -616,7 +624,7 @@ CudaKernel::CudaAsyncBuffer compute_tensor_rang } template 
-void LambMultiTensorReductionFunctor::operator()(ChunkGroup<4> chunk_group, const CudaKernel& kernel, void* reduction_buffer, size_t reduction_buffer_size) { +void LambMultiTensorReductionFunctor::operator()(cudaStream_t stream, ChunkGroup<4> chunk_group, const CudaKernel& kernel, void* reduction_buffer, size_t reduction_buffer_size) { // thread count per block. constexpr int thread_count = ChunkGroup<4>::thread_count_per_block; // shared memory's size per block. @@ -636,12 +644,12 @@ void LambMultiTensorReductionFunctor::operator() TOut2* d_buffer = reinterpret_cast(w_buffer + num_blocks); auto sync_range_and_lock = compute_tensor_range_and_lock(chunk_group, kernel); - LambMultiTensorReductionImpl<<>>( + LambMultiTensorReductionImpl<<>>( chunk_group, w_buffer, d_buffer, sync_range_and_lock.GpuPtr()); } #define INSTANTIATE_LAMB_MULTI_TENSOR_REDUCTION_FUNCTOR(TIn1, TIn2, TOut1, TOut2, TBuf) \ - template void LambMultiTensorReductionFunctor::operator()(ChunkGroup<4> chunk_group, const CudaKernel& kernel, void* reduction_buffer, size_t reduction_buffer_size); + template void LambMultiTensorReductionFunctor::operator()(cudaStream_t stream, ChunkGroup<4> chunk_group, const CudaKernel& kernel, void* reduction_buffer, size_t reduction_buffer_size); INSTANTIATE_LAMB_MULTI_TENSOR_REDUCTION_FUNCTOR(float, float, float, float, float) INSTANTIATE_LAMB_MULTI_TENSOR_REDUCTION_FUNCTOR(double, double, double, double, double) diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.h b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.h index 7882a94759..d5bf742a1b 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.h +++ b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.h @@ -49,6 +49,7 @@ class LambOptimizer final : public CudaKernel { // of this. template void LambComputeDirection( + cudaStream_t stream, const T1* weights, const T2* grads, const T3* moment_1, @@ -73,6 +74,7 @@ void LambComputeDirection( // of this. 
template void LambUpdate( + cudaStream_t stream, const T1* eta, const float ratio_min, const float ratio_max, @@ -106,6 +108,7 @@ void LambUpdate( template struct LambMultiTensorComputeDirectionFunctor { void operator()( + cudaStream_t stream, ChunkGroup<6> chunk_group, const T1* loss_scale, const T_GRAD_NORM* grad_norm, @@ -134,6 +137,7 @@ struct LambMultiTensorComputeDirectionFunctor { template struct LambMultiTensorReductionFunctor { void operator()( + cudaStream_t stream, ChunkGroup<4> chunk_group, const CudaKernel& kernel, void* reduction_buffer, @@ -183,6 +187,7 @@ struct LambMultiTensorSyncRangeAndLock { template struct LambMultiTensorUpdateFunctor { void operator()( + cudaStream_t stream, ChunkGroup<7> chunk_group, const T1* eta, const float ratio_min, diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/sg.cc b/orttraining/orttraining/training_ops/cuda/optimizer/sg.cc index 2501faa82c..048f1da8e7 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/sg.cc +++ b/orttraining/orttraining/training_ops/cuda/optimizer/sg.cc @@ -30,6 +30,7 @@ Status SGDOptimizer::ComputeInternal(OpKernelContext* ctx) const { ORT_ENFORCE(W.Shape() == G.Shape()); SGDOptimizerImpl( + Stream(), ETA.template Data(), W.template Data(), G.template Data(), diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/sg.cu b/orttraining/orttraining/training_ops/cuda/optimizer/sg.cu index addfed2f7b..aeab19d5eb 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/sg.cu +++ b/orttraining/orttraining/training_ops/cuda/optimizer/sg.cu @@ -31,6 +31,7 @@ __global__ void _SGDOptimizer( template void SGDOptimizerImpl( + cudaStream_t stream, const T* eta, const T* weights, const T* gradients, @@ -39,7 +40,7 @@ void SGDOptimizerImpl( size_t count) { int blocksPerGrid = (int)(ceil(static_cast(count) / GridDim::maxThreadsPerBlock)); CUDA_LONG N = static_cast(count); - _SGDOptimizer<<>>( + _SGDOptimizer<<>>( eta, weights, gradients, @@ -50,6 +51,7 @@ void SGDOptimizerImpl( #define SPECIALIZED_IMPL__SGDOptimizerImpl(T) \ template void SGDOptimizerImpl( \ + cudaStream_t stream, \ const T* eta, \ const T* weights, \ const T* gradients, \ diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/sg.h b/orttraining/orttraining/training_ops/cuda/optimizer/sg.h index 99d81f6984..80d47a8fa0 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/sg.h +++ b/orttraining/orttraining/training_ops/cuda/optimizer/sg.h @@ -10,6 +10,7 @@ namespace cuda { template void SGDOptimizerImpl( + cudaStream_t stream, const T* eta, const T* weights, const T* gradients, diff --git a/orttraining/orttraining/training_ops/cuda/reduction/all.cc b/orttraining/orttraining/training_ops/cuda/reduction/all.cc index 3e8741a9d6..9e1c282667 100644 --- a/orttraining/orttraining/training_ops/cuda/reduction/all.cc +++ b/orttraining/orttraining/training_ops/cuda/reduction/all.cc @@ -25,11 +25,14 @@ Status All::ComputeInternal(OpKernelContext* ctx) const { ORT_ENFORCE(size <= std::numeric_limits::max(), "Number of reduced elements (", size, ") exceeds the max allowed value (", std::numeric_limits::max(), ")."); + // TODO: LaunchAllKernel is implemented with thrust, which always uses default CUDA stream. 
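Aside (hypothetical alternative, not what this change does): the TODO above is what motivates the stream synchronizations added around LaunchAllKernel below. Thrust can also be bound to the compute stream through an execution policy, as sketched here; note thrust::all_of still blocks the host because it returns a host-side bool, so this only removes the explicit cudaStreamSynchronize calls, not the host stall. Names are illustrative.

#include <thrust/system/cuda/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/logical.h>

__global__ void AssignBool(bool* ptr, bool value) { *ptr = value; }

// Runs the reduction and the tiny result-assignment kernel on the given stream
// instead of the default stream.
void LaunchAllKernelOnStream(cudaStream_t stream, const bool* data, int size, bool* output) {
  const bool all_true =
      thrust::all_of(thrust::cuda::par.on(stream), data, data + size, thrust::identity<bool>());
  AssignBool<<<1, 1, 0, stream>>>(output, all_true);
}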
+ CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(Stream())); LaunchAllKernel( + Stream(), input.Data(), static_cast(size), output.MutableData()); - + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(0)); return Status::OK(); } diff --git a/orttraining/orttraining/training_ops/cuda/reduction/all.cu b/orttraining/orttraining/training_ops/cuda/reduction/all.cu index 64c62523d5..678d01893d 100644 --- a/orttraining/orttraining/training_ops/cuda/reduction/all.cu +++ b/orttraining/orttraining/training_ops/cuda/reduction/all.cu @@ -21,13 +21,13 @@ __global__ void assign_false(bool* ptr) { } template<> -void LaunchAllKernel(const bool* data, const int size, bool* output) { +void LaunchAllKernel(cudaStream_t stream, const bool* data, const int size, bool* output) { if(thrust::all_of(thrust::device, data, data + size, thrust::identity())) { - assign_true<<<1, 1, 0>>>(output); + assign_true<<<1, 1, 0, stream>>>(output); } else { - assign_false<<<1, 1, 0>>>(output); + assign_false<<<1, 1, 0, stream>>>(output); } } diff --git a/orttraining/orttraining/training_ops/cuda/reduction/all.h b/orttraining/orttraining/training_ops/cuda/reduction/all.h index 7e687cc7f9..f15f3fdff5 100644 --- a/orttraining/orttraining/training_ops/cuda/reduction/all.h +++ b/orttraining/orttraining/training_ops/cuda/reduction/all.h @@ -16,7 +16,7 @@ class All final : public CudaKernel { }; template -void LaunchAllKernel(const T* data, const int size, bool* output); +void LaunchAllKernel(cudaStream_t stream, const T* data, const int size, bool* output); } // namespace cuda } // namespace onnxruntime diff --git a/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cc b/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cc index 5d90e9936b..654b915ffe 100644 --- a/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cc +++ b/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cc @@ -44,7 +44,7 @@ Status ReduceAllL2::ComputeInternal(OpKernelContext* ctx) const { // Allocate output tensor. Tensor* output = ctx->Output(0, {}); CudaTOut* p_output = reinterpret_cast(output->template MutableData()); - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(p_output, 0, sizeof(CudaTOut))); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(p_output, 0, sizeof(CudaTOut), Stream())); const bool deterministic = ctx->GetUseDeterministicCompute(); @@ -54,12 +54,12 @@ Status ReduceAllL2::ComputeInternal(OpKernelContext* ctx) const { // Check if all values are finite and write true to deviceOutput. // Otherwise, false will be written. - launch_multi_tensor_functor<1, TFunctor>( + launch_multi_tensor_functor<1, TFunctor>(Stream(), 2048 * 32, tensor_sizes, grouped_tensor_pointers, functor, p_output); // *p_output is the squared sum of all elements. // Let's take a sqrt to get the actual L2-norm. - ScalarSqrt(p_output, p_output); + ScalarSqrt(Stream(), p_output, p_output); } else { // alternate path only for deterministic compute .. 
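Aside (not part of this change): both paths below compute the same quantity, the global L2 norm over all grouped tensors,

    ||x||_2 = sqrt( sum_i sum_j x_{i,j}^2 )

The deterministic path just accumulates each tensor's squared sum separately (reduce_square_sum), sums those partial results (reduce_sum), and takes the square root on device (ScalarSqrt) — with this change, all of it enqueued on the kernel's compute stream.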
typedef AccumulationType_t CudaTAcc; @@ -81,7 +81,7 @@ Status ReduceAllL2::ComputeInternal(OpKernelContext* ctx) const { // buffer for final output and square norms of each tensor auto results_buffer = GetScratchBuffer(1 + total_tensor_count); - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(results_buffer.get(), 0, sizeof(CudaTAcc) * (1 + total_tensor_count))); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(results_buffer.get(), 0, sizeof(CudaTAcc) * (1 + total_tensor_count), Stream())); CudaTAcc* p_global_sqnorm = results_buffer.get(); CudaTAcc* p_tensor_sqnorm = p_global_sqnorm + 1; @@ -90,11 +90,11 @@ Status ReduceAllL2::ComputeInternal(OpKernelContext* ctx) const { for (int i = 0; i < total_tensor_count; ++i) { CudaTIn* p_tensor_i = reinterpret_cast(grouped_tensor_pointers[i][0]); ORT_RETURN_IF_ERROR(reduce_square_sum( - p_tensor_i, p_tensor_sqnorm + i, tensor_sizes[i], reduction_buffer.get(), reduction_buffer_size)); + Stream(), p_tensor_i, p_tensor_sqnorm + i, tensor_sizes[i], reduction_buffer.get(), reduction_buffer_size)); } ORT_RETURN_IF_ERROR(reduce_sum( - p_tensor_sqnorm, p_global_sqnorm, total_tensor_count, reduction_buffer.get(), reduction_buffer_size)); - ScalarSqrt(p_global_sqnorm, p_output); + Stream(), p_tensor_sqnorm, p_global_sqnorm, total_tensor_count, reduction_buffer.get(), reduction_buffer_size)); + ScalarSqrt(Stream(), p_global_sqnorm, p_output); } return Status::OK(); diff --git a/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cu b/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cu index adde87d307..16603e1ade 100644 --- a/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cu +++ b/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cu @@ -17,16 +17,16 @@ __global__ void ScalarSqrtKernel(Tin* input, Tout* output) { } template -void ScalarSqrt(Tin* input, Tout* output) { - ScalarSqrtKernel<<<1, 1, 0>>>(input, output); -} +void ScalarSqrt(cudaStream_t stream, Tin* input, Tout* output) { + ScalarSqrtKernel<<<1, 1, 0, stream>>>(input, output); +}; -template void ScalarSqrt(float* input, float* output); -template void ScalarSqrt(half* input, half* output); -template void ScalarSqrt(float* input, half* output); +template void ScalarSqrt(cudaStream_t stream, float* input, float* output); +template void ScalarSqrt(cudaStream_t stream, half* input, half* output); +template void ScalarSqrt(cudaStream_t stream, float* input, half* output); #if CUDA_VERSION >= 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) -template void ScalarSqrt(nv_bfloat16* input, nv_bfloat16* output); -template void ScalarSqrt(float* input, nv_bfloat16* output); +template void ScalarSqrt(cudaStream_t stream, nv_bfloat16* input, nv_bfloat16* output); +template void ScalarSqrt(cudaStream_t stream, float* input, nv_bfloat16* output); #endif template @@ -87,7 +87,7 @@ __global__ void MultiTensorReduceKernel(ChunkGroup<1> chunk_group, TOut* output) } template -void MultiTensorReduce(ChunkGroup<1> chunk_group, TOut* output) { +void MultiTensorReduce(cudaStream_t stream, ChunkGroup<1> chunk_group, TOut* output) { // thread count per block. constexpr int thread_count = ChunkGroup<1>::thread_count_per_block; // shared memory's size per block. 
@@ -97,17 +97,17 @@ void MultiTensorReduce(ChunkGroup<1> chunk_group, TOut* output) { ORT_ENFORCE(thread_count % GPU_WARP_SIZE == 0); ORT_ENFORCE((thread_count & (thread_count - 1)) == 0); - MultiTensorReduceKernel<<>>(chunk_group, output); + MultiTensorReduceKernel<<>>(chunk_group, output); } template -void MultiTensorReduceL2::operator()(ChunkGroup<1> chunk_group, TOut* output) { +void MultiTensorReduceL2::operator()(cudaStream_t stream, ChunkGroup<1> chunk_group, TOut* output) { using TBuf = AccumulationType_t; - MultiTensorReduce(chunk_group, output); + MultiTensorReduce(stream, chunk_group, output); } #define INSTANTIATE_MULTI_TENSOR_REDUCTION_L2_FUNCTOR(TIn, TOut) \ - template void MultiTensorReduceL2::operator()(ChunkGroup<1> chunk_group, TOut* output); + template void MultiTensorReduceL2::operator()(cudaStream_t stream, ChunkGroup<1> chunk_group, TOut* output); INSTANTIATE_MULTI_TENSOR_REDUCTION_L2_FUNCTOR(double, float) INSTANTIATE_MULTI_TENSOR_REDUCTION_L2_FUNCTOR(float, float) diff --git a/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.h b/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.h index f3ea5130eb..7de6e2ee9b 100644 --- a/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.h +++ b/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.h @@ -18,11 +18,11 @@ class ReduceAllL2 final : public CudaKernel { template struct MultiTensorReduceL2 { - void operator()(ChunkGroup<1> chunk_group, TOut* output); + void operator()(cudaStream_t stream, ChunkGroup<1> chunk_group, TOut* output); }; template -void ScalarSqrt(Tin* input, Tout* output); +void ScalarSqrt(cudaStream_t stream, Tin* input, Tout* output); } // namespace cuda } // namespace onnxruntime diff --git a/orttraining/orttraining/training_ops/cuda/reduction/reduction_ops.cc b/orttraining/orttraining/training_ops/cuda/reduction/reduction_ops.cc index deb316a562..adfaeff30f 100644 --- a/orttraining/orttraining/training_ops/cuda/reduction/reduction_ops.cc +++ b/orttraining/orttraining/training_ops/cuda/reduction/reduction_ops.cc @@ -47,7 +47,7 @@ Status ReduceKernel::ComputeImplEx(OpKernelContext* ctx, cudnn // empty axes and no-op if (axes.empty() && noop_with_empty_axes_) { auto* Y = ctx->Output(0, X->Shape()); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes(), cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes(), cudaMemcpyDeviceToDevice, Stream())); return Status::OK(); } @@ -80,7 +80,7 @@ Status ReduceKernel::ComputeImplExOutput(0, X->Shape()); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes(), cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes(), cudaMemcpyDeviceToDevice, Stream())); return Status::OK(); } @@ -107,14 +107,14 @@ Status ReduceKernel::ComputeImplExtemplate MutableData() != X->template Data()) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData(), X->template Data(), input_count * sizeof(int32_t), cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData(), X->template Data(), input_count * sizeof(int32_t), cudaMemcpyDeviceToDevice, Stream())); } return Status::OK(); } // This reduction keep adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000. 
// Therefore zeroing out the memory is required - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(Y->MutableDataRaw(), 0, Y->SizeInBytes())); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(Y->MutableDataRaw(), 0, Y->SizeInBytes(), Stream())); size_t indices_bytes = 0; size_t workspace_bytes = 0; @@ -124,7 +124,7 @@ Status ReduceKernel::ComputeImplEx temp_X = GetScratchBuffer(input_count); - Impl_Cast(reinterpret_cast(X->template Data()), temp_X.get(), X->Shape().Size()); + Impl_Cast(Stream(), reinterpret_cast(X->template Data()), temp_X.get(), X->Shape().Size()); ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, cudnn_type_X, CUDNN_REDUCE_TENSOR_FLATTENED_INDICES)); ORT_RETURN_IF_ERROR(input_tensor.Set(input_dims_cudnn, cudnn_type_X)); @@ -150,7 +150,7 @@ Status ReduceKernel::ComputeImplEx(temp_Y.get(), Y->template MutableData(), output_count); + Impl_Cast(Stream(), temp_Y.get(), Y->template MutableData(), output_count); return Status::OK(); } diff --git a/orttraining/orttraining/training_ops/cuda/tensor/concat.cc b/orttraining/orttraining/training_ops/cuda/tensor/concat.cc index 3185f3eb6f..0404c1fa4e 100644 --- a/orttraining/orttraining/training_ops/cuda/tensor/concat.cc +++ b/orttraining/orttraining/training_ops/cuda/tensor/concat.cc @@ -60,7 +60,8 @@ Status ConcatTraining::ComputeInternal(OpKernelContext* ctx) const { int block_size_inside_axis_dim = static_cast(p.output_axis_pitch / p.output_tensor->Shape()[p.axis]); int block_size_including_axis_dim = static_cast(p.output_axis_pitch); auto element_bytes = p.output_tensor->DataType()->Size(); - ORT_RETURN_IF_ERROR(ConcatImpl(element_bytes, + ORT_RETURN_IF_ERROR(ConcatImpl(Stream(), + element_bytes, block_size_including_axis_dim, block_size_inside_axis_dim, concat_sizes_gpu.GpuPtr(), @@ -71,7 +72,7 @@ Status ConcatTraining::ComputeInternal(OpKernelContext* ctx) const { p.output_num_elements)); Tensor* output_1_tensor = ctx->Output(1, {input_count}); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_1_tensor->template MutableData(), concat_sizes_gpu.GpuPtr(), input_count * sizeof(int64_t), cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_1_tensor->template MutableData(), concat_sizes_gpu.GpuPtr(), input_count * sizeof(int64_t), cudaMemcpyDeviceToDevice, Stream())); return Status::OK(); } diff --git a/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad.cc b/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad.cc index eb24e3eb57..983960a2df 100644 --- a/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad.cc +++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad.cc @@ -24,7 +24,8 @@ ONNX_OPERATOR_KERNEL_EX( template struct GatherElementsGrad::ComputeImpl { - Status operator()(const Tensor* dY, + Status operator()(cudaStream_t stream, + const Tensor* dY, const Tensor* indices_tensor, Tensor* dX, const int rank, @@ -42,6 +43,7 @@ struct GatherElementsGrad::ComputeImpl { if (utils::IsPrimitiveDataType(Tin_type)) { const int32_t* indices_data = indices_tensor->template Data(); return GatherElementsGradImpl( + stream, rank, buffer_output_dims, buffer_input_strides, @@ -55,6 +57,7 @@ struct GatherElementsGrad::ComputeImpl { } else if (utils::IsPrimitiveDataType(Tin_type)) { const int64_t* indices_data = indices_tensor->template Data(); return GatherElementsGradImpl( + stream, rank, buffer_output_dims, buffer_input_strides, @@ -113,7 +116,7 @@ Status GatherElementsGrad::ComputeInternal(OpKernelContext* context) const { int rank = static_cast(output_dims.size()); 
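Aside (minimal self-contained sketch, hypothetical names rather than ORT APIs): the pattern used throughout these gradient kernels is to zero the output with cudaMemsetAsync on the compute stream and launch the scatter/accumulate kernel on the same stream; stream ordering alone then guarantees the kernel observes zeroed memory, with no host-side synchronization in between.

#include <cstdint>
#include <cuda_runtime.h>

__global__ void ScatterAddKernel(const float* dY, const int64_t* idx, float* dX, size_t n) {
  size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (i < n) atomicAdd(&dX[idx[i]], dY[i]);  // accumulate gathered gradients into dX
}

cudaError_t ZeroAndScatterAdd(cudaStream_t stream, const float* dY, const int64_t* idx,
                              float* dX, size_t dX_bytes, size_t n) {
  // Enqueue the memset first; the kernel launched on the same stream runs after it.
  cudaError_t err = cudaMemsetAsync(dX, 0, dX_bytes, stream);
  if (err != cudaSuccess) return err;
  constexpr int threads = 256;
  const int blocks = static_cast<int>((n + threads - 1) / threads);
  ScatterAddKernel<<<blocks, threads, 0, stream>>>(dY, idx, dX, n);
  return cudaGetLastError();
}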
Tensor* dX = context->Output(0, data_shape); - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(dX->MutableDataRaw(), 0, dX->SizeInBytes())); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(dX->MutableDataRaw(), 0, dX->SizeInBytes(), Stream())); TArray buffer_output_dims(output_dims); TensorPitches input_strides(output_dims); @@ -128,7 +131,7 @@ Status GatherElementsGrad::ComputeInternal(OpKernelContext* context) const { utils::MLTypeCallDispatcherRet t_disp(dY->GetElementType()); - return t_disp.Invoke(dY, indices_tensor, dX, rank, + return t_disp.Invoke(Stream(), dY, indices_tensor, dX, rank, buffer_output_dims, buffer_input_strides, indices_size, buffer_indices_dims, fdm_indices_strides, axis); } diff --git a/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad_impl.h b/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad_impl.h index 713fe3f7bc..c6873c301a 100755 --- a/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad_impl.h +++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad_impl.h @@ -11,6 +11,7 @@ namespace cuda { template Status GatherElementsGradImpl( + cudaStream_t stream, const int rank, TArray& buffer_input_dims, TArray& buffer_input_strides, diff --git a/orttraining/orttraining/training_ops/cuda/tensor/gather_grad.cc b/orttraining/orttraining/training_ops/cuda/tensor/gather_grad.cc index 41607cb88d..9a89cda8c1 100644 --- a/orttraining/orttraining/training_ops/cuda/tensor/gather_grad.cc +++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_grad.cc @@ -36,6 +36,7 @@ ONNX_OPERATOR_KERNEL_EX( namespace { template Status CallGatherGradImpl( + cudaStream_t stream, const CudaScratchBufferAllocator& allocator, int64_t num_gathered_per_index, int64_t gather_dimension_size, int64_t num_batches, const Tensor& dY, const Tensor& gathered_indices, @@ -49,6 +50,7 @@ Status CallGatherGradImpl( const SafeInt num_gathered_indices{gathered_indices.Shape().Size()}; GatherGradImpl( + stream, allocator, reinterpret_cast(dY_data), indices_data, @@ -63,6 +65,7 @@ Status CallGatherGradImpl( template Status DispatchToGatherGradImplByTindex( + cudaStream_t stream, MLDataType tindex_data_type, const CudaScratchBufferAllocator& allocator, int64_t num_gathered_per_index, int64_t gather_dimension_size, int64_t num_batches, @@ -70,16 +73,17 @@ Status DispatchToGatherGradImplByTindex( Tensor& dX) { if (utils::IsPrimitiveDataType(tindex_data_type)) { return CallGatherGradImpl( - allocator, num_gathered_per_index, gather_dimension_size, num_batches, dY, gathered_indices, dX); + stream, allocator, num_gathered_per_index, gather_dimension_size, num_batches, dY, gathered_indices, dX); } else if (utils::IsPrimitiveDataType(tindex_data_type)) { return CallGatherGradImpl( - allocator, num_gathered_per_index, gather_dimension_size, num_batches, dY, gathered_indices, dX); + stream, allocator, num_gathered_per_index, gather_dimension_size, num_batches, dY, gathered_indices, dX); } return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "GatherGrad unsupported TIndex type: ", tindex_data_type); } Status DispatchToGatherGradImpl( + cudaStream_t stream, MLDataType t_data_type, MLDataType tindex_data_type, const CudaScratchBufferAllocator& allocator, int64_t num_gathered_per_index, int64_t gather_dimension_size, int64_t num_batches, @@ -87,14 +91,14 @@ Status DispatchToGatherGradImpl( Tensor& dX) { if (utils::IsPrimitiveDataType(t_data_type)) { return DispatchToGatherGradImplByTindex( - tindex_data_type, allocator, num_gathered_per_index, gather_dimension_size, num_batches, 
dY, gathered_indices, dX); + stream, tindex_data_type, allocator, num_gathered_per_index, gather_dimension_size, num_batches, dY, gathered_indices, dX); } else if (utils::IsPrimitiveDataType(t_data_type)) { return DispatchToGatherGradImplByTindex( - tindex_data_type, allocator, num_gathered_per_index, gather_dimension_size, num_batches, dY, gathered_indices, dX); + stream, tindex_data_type, allocator, num_gathered_per_index, gather_dimension_size, num_batches, dY, gathered_indices, dX); #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 } else if (utils::IsPrimitiveDataType(t_data_type)) { return DispatchToGatherGradImplByTindex( - tindex_data_type, allocator, num_gathered_per_index, gather_dimension_size, num_batches, dY, gathered_indices, dX); + stream, tindex_data_type, allocator, num_gathered_per_index, gather_dimension_size, num_batches, dY, gathered_indices, dX); #endif } @@ -109,7 +113,7 @@ Status GatherGrad::ComputeInternal(OpKernelContext* context) const { const Tensor* dY = context->Input(2); Tensor* dX = context->Output(0, X_shape); - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(dX->MutableDataRaw(), 0, dX->SizeInBytes())); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(dX->MutableDataRaw(), 0, dX->SizeInBytes(), Stream())); if (gathered_indices->Shape().Size() == 0) { // nothing else to do @@ -125,7 +129,7 @@ Status GatherGrad::ComputeInternal(OpKernelContext* context) const { const int64_t num_batches = X_shape.SizeToDimension(axis); return DispatchToGatherGradImpl( - t_type, tindex_type, CudaScratchBufferAllocator{*this}, + Stream(), t_type, tindex_type, CudaScratchBufferAllocator{*this}, num_gathered_per_index, gather_dimension_size, num_batches, *dY, *gathered_indices, *dX); } diff --git a/orttraining/orttraining/training_ops/cuda/tensor/gather_grad_impl.cu b/orttraining/orttraining/training_ops/cuda/tensor/gather_grad_impl.cu index 60713126ae..9c0537c81d 100644 --- a/orttraining/orttraining/training_ops/cuda/tensor/gather_grad_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_grad_impl.cu @@ -51,6 +51,7 @@ __global__ void CopyKernel(TOutputIterator dst, TInputIterator src, int64_t leng // get sorted dX and dY indices, ordered by dX indices template void GetSortedIndices( + cudaStream_t stream, const CudaScratchBufferAllocator& allocator, const TIndex* dX_indices, GatheredIndexIndex_t num_gathered_indices, @@ -58,7 +59,7 @@ void GetSortedIndices( IAllocatorUniquePtr& dY_indices_sorted_out) { auto dY_indices = allocator.GetScratchBuffer(num_gathered_indices); CopyKernel<<>>( + GridDim::maxThreadsPerBlock, 0, stream>>>( dY_indices.get(), cub::CountingInputIterator{0}, num_gathered_indices); auto dX_indices_sorted = allocator.GetScratchBuffer(num_gathered_indices); @@ -69,14 +70,14 @@ void GetSortedIndices( nullptr, temp_storage_size_bytes, dX_indices, dX_indices_sorted.get(), dY_indices.get(), dY_indices_sorted.get(), - num_gathered_indices)); + num_gathered_indices, 0, sizeof(TIndex)*8, stream)); auto temp_storage = allocator.GetScratchBuffer(temp_storage_size_bytes); CUDA_CALL_THROW(cub::DeviceRadixSort::SortPairs( temp_storage.get(), temp_storage_size_bytes, dX_indices, dX_indices_sorted.get(), dY_indices.get(), dY_indices_sorted.get(), - num_gathered_indices)); + num_gathered_indices, 0, sizeof(TIndex)*8, stream)); dX_indices_sorted_out = std::move(dX_indices_sorted); dY_indices_sorted_out = std::move(dY_indices_sorted); @@ -84,18 +85,19 @@ void GetSortedIndices( template IAllocatorUniquePtr GetOffsetsFromCounts( + cudaStream_t stream, const 
CudaScratchBufferAllocator& allocator, const T* counts, int32_t num_counts) { auto offsets = allocator.GetScratchBuffer(num_counts); size_t temp_storage_size_bytes = 0; CUDA_CALL_THROW(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_size_bytes, - counts, offsets.get(), num_counts)); + counts, offsets.get(), num_counts, stream)); auto temp_storage = allocator.GetScratchBuffer(temp_storage_size_bytes); CUDA_CALL_THROW(cub::DeviceScan::ExclusiveSum( temp_storage.get(), temp_storage_size_bytes, - counts, offsets.get(), num_counts)); + counts, offsets.get(), num_counts, stream)); return offsets; } @@ -157,6 +159,7 @@ __global__ void DirectSumKernel( // directly sum gathered dY values into the corresponding dX value template void DirectSumImpl( + cudaStream_t stream, const TIndex* dX_indices_sorted, const TIndex* dY_indices_sorted, const T* dY_data, @@ -168,7 +171,7 @@ void DirectSumImpl( dim3 block(GPU_WARP_SIZE, 4); dim3 grid(CeilDiv(num_gathered_indices, 4), CeilDiv(num_gathered_per_index, 128)); - DirectSumKernel<<>>( + DirectSumKernel<<>>( dX_indices_sorted, dY_indices_sorted, dY_data, @@ -299,6 +302,7 @@ __global__ void ComputeSegmentSumsAndScatterKernel( // the corresponding dX value template void PartialSumsImpl( + cudaStream_t stream, const CudaScratchBufferAllocator& allocator, const TIndex* dX_indices_sorted, const TIndex* dY_indices_sorted, @@ -317,29 +321,30 @@ void PartialSumsImpl( auto per_segment_partial_segment_counts = allocator.GetScratchBuffer(num_segments); { const auto blocks_per_grid = CeilDiv(num_gathered_indices, GridDim::maxThreadsPerBlock); - ComputePerSegmentPartialSegmentCountsKernel<<>>( + ComputePerSegmentPartialSegmentCountsKernel<<>>( per_segment_partial_segment_counts.get(), segment_offsets, num_segments, num_gathered_indices); } // compute partial segment offsets per segment auto per_segment_partial_segment_offsets = GetOffsetsFromCounts( - allocator, per_segment_partial_segment_counts.get(), num_segments); + stream, allocator, per_segment_partial_segment_counts.get(), num_segments); SegmentIndex_t host_num_partial_segments = 0; { SegmentIndex_t last_segment_partial_segment_offset = 0, last_segment_partial_segment_count = 0; // CPU/GPU sync! - CUDA_CALL_THROW(cudaMemcpy( + CUDA_CALL_THROW(cudaMemcpyAsync( &last_segment_partial_segment_offset, &per_segment_partial_segment_offsets.get()[num_segments - 1], - sizeof(SegmentIndex_t), cudaMemcpyDeviceToHost)); + sizeof(SegmentIndex_t), cudaMemcpyDeviceToHost, stream)); // CPU/GPU sync! 
- CUDA_CALL_THROW(cudaMemcpy( + CUDA_CALL_THROW(cudaMemcpyAsync( &last_segment_partial_segment_count, &per_segment_partial_segment_counts.get()[num_segments - 1], - sizeof(SegmentIndex_t), cudaMemcpyDeviceToHost)); + sizeof(SegmentIndex_t), cudaMemcpyDeviceToHost, stream)); + CUDA_CALL_THROW(cudaStreamSynchronize(stream)); host_num_partial_segments = last_segment_partial_segment_offset + last_segment_partial_segment_count; } @@ -348,7 +353,7 @@ void PartialSumsImpl( auto partial_segment_offsets = allocator.GetScratchBuffer(host_num_partial_segments); { const auto blocks_per_grid = CeilDiv(num_segments, GridDim::maxThreadsPerBlock); - ComputePartialSegmentOffsetsKernel<<>>( + ComputePartialSegmentOffsetsKernel<<>>( partial_segment_offsets.get(), per_segment_partial_segment_counts.get(), per_segment_partial_segment_offsets.get(), @@ -369,7 +374,7 @@ void PartialSumsImpl( const dim3 blocks_per_grid( CeilDiv(host_num_partial_segments * num_gathered_per_index_warp_size_multiple, threads_per_block), num_batches); - ComputePartialSegmentSumsKernel<<>>( + ComputePartialSegmentSumsKernel<<>>( dY_indices_sorted, dY_data, num_gathered_indices, @@ -385,7 +390,7 @@ void PartialSumsImpl( const dim3 blocks_per_grid( CeilDiv(num_segments * num_gathered_per_index_warp_size_multiple, threads_per_block), num_batches); - ComputeSegmentSumsAndScatterKernel<<>>( + ComputeSegmentSumsAndScatterKernel<<>>( dX_indices_sorted, dX_data, num_gathered_per_index, @@ -402,6 +407,7 @@ void PartialSumsImpl( template void Impl( + cudaStream_t stream, const CudaScratchBufferAllocator& allocator, const T* dY_data, const TIndex* dX_indices, @@ -412,6 +418,7 @@ void Impl( T* dX_data) { IAllocatorUniquePtr dX_indices_sorted, dY_indices_sorted; GetSortedIndices( + stream, allocator, dX_indices, num_gathered_indices, dX_indices_sorted, dY_indices_sorted); @@ -425,17 +432,18 @@ void Impl( CUDA_CALL_THROW(cub::DeviceRunLengthEncode::Encode( nullptr, temp_storage_size_bytes, dX_indices_sorted.get(), cub::DiscardOutputIterator{}, segment_counts.get(), - num_segments.get(), num_gathered_indices)); + num_segments.get(), num_gathered_indices, stream)); auto temp_storage = allocator.GetScratchBuffer(temp_storage_size_bytes); CUDA_CALL_THROW(cub::DeviceRunLengthEncode::Encode( temp_storage.get(), temp_storage_size_bytes, dX_indices_sorted.get(), cub::DiscardOutputIterator{}, segment_counts.get(), - num_segments.get(), num_gathered_indices)); + num_segments.get(), num_gathered_indices, stream)); // CPU/GPU sync! - CUDA_CALL_THROW(cudaMemcpy( - &host_num_segments, num_segments.get(), sizeof(SegmentIndex_t), cudaMemcpyDeviceToHost)); + CUDA_CALL_THROW(cudaMemcpyAsync( + &host_num_segments, num_segments.get(), sizeof(SegmentIndex_t), cudaMemcpyDeviceToHost, stream)); + CUDA_CALL_THROW(cudaStreamSynchronize(stream)); } // get largest segment size and use that to select implementation @@ -446,30 +454,32 @@ void Impl( size_t temp_storage_size_bytes = 0; CUDA_CALL_THROW(cub::DeviceReduce::Max( nullptr, temp_storage_size_bytes, - segment_counts.get(), max_segment_count.get(), host_num_segments)); + segment_counts.get(), max_segment_count.get(), host_num_segments, stream)); auto temp_storage = allocator.GetScratchBuffer(temp_storage_size_bytes); CUDA_CALL_THROW(cub::DeviceReduce::Max( temp_storage.get(), temp_storage_size_bytes, - segment_counts.get(), max_segment_count.get(), host_num_segments)); + segment_counts.get(), max_segment_count.get(), host_num_segments, stream)); // CPU/GPU sync! 
- CUDA_CALL_THROW(cudaMemcpy( - &host_max_segment_count, max_segment_count.get(), sizeof(GatheredIndexIndex_t), cudaMemcpyDeviceToHost)); + CUDA_CALL_THROW(cudaMemcpyAsync( + &host_max_segment_count, max_segment_count.get(), sizeof(GatheredIndexIndex_t), cudaMemcpyDeviceToHost, stream)); + CUDA_CALL_THROW(cudaStreamSynchronize(stream)); } constexpr GatheredIndexIndex_t kMaxSegmentSizeThreshold = 32; if (host_max_segment_count <= kMaxSegmentSizeThreshold) { DirectSumImpl( - dX_indices_sorted.get(), dY_indices_sorted.get(), + stream, dX_indices_sorted.get(), dY_indices_sorted.get(), dY_data, dX_data, num_gathered_indices, num_gathered_per_index, gather_dimension_size, num_batches); } else { auto segment_offsets = GetOffsetsFromCounts( - allocator, segment_counts.get(), host_num_segments); + stream, allocator, segment_counts.get(), host_num_segments); segment_counts.reset(); PartialSumsImpl( + stream, allocator, dX_indices_sorted.get(), dY_indices_sorted.get(), dY_data, dX_data, @@ -482,6 +492,7 @@ void Impl( // doesn't perform well if there are many duplicate values in dX_indices template void Impl_Simplified( + cudaStream_t stream, const CudaScratchBufferAllocator& allocator, const T* dY_data, const TIndex* dX_indices, @@ -492,6 +503,7 @@ void Impl_Simplified( T* dX_data) { IAllocatorUniquePtr dX_indices_sorted, dY_indices_sorted; GetSortedIndices( + stream, allocator, dX_indices, num_gathered_indices, dX_indices_sorted, dY_indices_sorted); @@ -499,7 +511,7 @@ void Impl_Simplified( dim3 block(GPU_WARP_SIZE, 4); dim3 grid(CeilDiv(num_gathered_indices, 4), CeilDiv(num_gathered_per_index, 128)); - DirectSumKernel<<>>( + DirectSumKernel<<>>( dX_indices_sorted.get(), dY_indices_sorted.get(), dY_data, @@ -514,6 +526,7 @@ void Impl_Simplified( template void GatherGradImpl( + cudaStream_t stream, const CudaScratchBufferAllocator& allocator, const T* dY_data, const TIndex* dX_indices, @@ -523,6 +536,7 @@ void GatherGradImpl( const int64_t num_batches, T* dX_data) { gather_grad_internal::Impl( + stream, allocator, dY_data, dX_indices, num_gathered_indices, gather_dimension_size, num_gathered_per_index, num_batches, @@ -531,6 +545,7 @@ void GatherGradImpl( #define SPECIALIZED(T, TIndex) \ template void GatherGradImpl( \ + cudaStream_t stream, \ const CudaScratchBufferAllocator& allocator, \ const T* dY_data, \ const TIndex* dX_indices, \ diff --git a/orttraining/orttraining/training_ops/cuda/tensor/gather_grad_impl.h b/orttraining/orttraining/training_ops/cuda/tensor/gather_grad_impl.h index 4a174da99f..a792e08f10 100644 --- a/orttraining/orttraining/training_ops/cuda/tensor/gather_grad_impl.h +++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_grad_impl.h @@ -28,6 +28,7 @@ using GatheredIndexIndex_t = int32_t; template void GatherGradImpl( + cudaStream_t stream, const CudaScratchBufferAllocator& allocator, const T* dY_data, const TIndex* dX_indices, diff --git a/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad.cc b/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad.cc index b7dad4963b..90ea1bca1b 100644 --- a/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad.cc +++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad.cc @@ -36,13 +36,15 @@ REGISTER_KERNEL_TYPED_GATHER_ND_GRAD(int64_t) template struct GatherNDGradComputeImpl { - void operator()(const int64_t num_slices, + void operator()(cudaStream_t stream, + const int64_t num_slices, const int64_t slice_size, const void* const kernel_input_data, void* const kernel_output_data, int64_t* 
const input_slice_offsets_data) const { typedef typename ToCudaType::MappedType CudaT; - GatherNDGradImpl(num_slices, kernel_input_data, + GatherNDGradImpl(stream, + num_slices, kernel_input_data, kernel_output_data, slice_size, input_slice_offsets_data); } @@ -82,20 +84,21 @@ Status GatherNDGrad::ComputeInternal(OpKernelContext* context) const { auto output_tensor = context->Output(0, input_shape); // TODO this memset can be expensive, a sparse tensor representation would help here - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(output_tensor->MutableDataRaw(), 0, output_tensor->SizeInBytes())); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(output_tensor->MutableDataRaw(), 0, output_tensor->SizeInBytes(), Stream())); // Compute int64_t num_slices; int64_t slice_size; IAllocatorUniquePtr input_slice_offsets_buffer; - ORT_RETURN_IF_ERROR(PrepareCompute(batch_dims_, input_shape, indices_shape, indices_tensor, + ORT_RETURN_IF_ERROR(PrepareCompute(Stream(), + batch_dims_, input_shape, indices_shape, indices_tensor, num_slices, slice_size, input_slice_offsets_buffer)); const void* const kernel_input_data = update_tensor->DataRaw(); void* const kernel_output_data = output_tensor->MutableDataRaw(); utils::MLTypeCallDispatcher t_disp(update_tensor->GetElementType()); - t_disp.Invoke(num_slices, slice_size, kernel_input_data, kernel_output_data, input_slice_offsets_buffer.get()); + t_disp.Invoke(Stream(), num_slices, slice_size, kernel_input_data, kernel_output_data, input_slice_offsets_buffer.get()); return Status::OK(); } diff --git a/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad_impl.cu b/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad_impl.cu index d733887af9..8eddbd21a8 100644 --- a/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad_impl.cu @@ -26,18 +26,19 @@ __global__ void _GatherNDGradKernel( template void GatherNDGradImpl( + cudaStream_t stream, const size_t num_slices, const void* update_data, void* output_data, const size_t slice_size, const int64_t* input_slice_offsets_data) { const unsigned int blocks_per_grid = static_cast(CeilDiv(num_slices * slice_size, GridDim::maxThreadsPerBlock)); - _GatherNDGradKernel<<>>( + _GatherNDGradKernel<<>>( num_slices, static_cast(update_data), static_cast(output_data), slice_size, input_slice_offsets_data); } #define SPECIALIZED_GRAD_IMPL(T) \ - template void GatherNDGradImpl(const size_t num_slices, const void* update_data, void* output_data, const size_t slice_size, const int64_t* input_slice_offsets_data) + template void GatherNDGradImpl(cudaStream_t stream, const size_t num_slices, const void* update_data, void* output_data, const size_t slice_size, const int64_t* input_slice_offsets_data) SPECIALIZED_GRAD_IMPL(float); #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 diff --git a/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad_impl.h b/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad_impl.h index 3b19e758e2..e00a3ed410 100644 --- a/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad_impl.h +++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad_impl.h @@ -9,6 +9,7 @@ namespace cuda { template void GatherNDGradImpl( + cudaStream_t stream, const size_t num_slices, const void* update_data, void* output_data, diff --git a/orttraining/orttraining/training_ops/cuda/tensor/slice_grad.cc b/orttraining/orttraining/training_ops/cuda/tensor/slice_grad.cc index 553d67bcc3..7004cbfb70 100644 
--- a/orttraining/orttraining/training_ops/cuda/tensor/slice_grad.cc
+++ b/orttraining/orttraining/training_ops/cuda/tensor/slice_grad.cc
@@ -52,8 +52,9 @@ Status SliceGrad::CallSliceImp(size_t element_size, size_t dimension_count, cons
                                const TArray<fast_divmod>& output_strides, OpKernelContext* ctx,
                                const TensorShape& output_shape) const {
   Tensor* gradient_out_tensor = GetOutputGradientTensor(ctx);
-  CUDA_RETURN_IF_ERROR(cudaMemsetAsync(gradient_out_tensor->MutableDataRaw(), 0, gradient_out_tensor->SizeInBytes()));
+  CUDA_RETURN_IF_ERROR(cudaMemsetAsync(gradient_out_tensor->MutableDataRaw(), 0, gradient_out_tensor->SizeInBytes(), Stream()));
-  return SliceImplGrad(element_size,
+  return SliceImplGrad(Stream(),
+                       element_size,
                        gsl::narrow_cast<int32_t>(dimension_count),
                        starts_buffer,
                        steps_buffer,
diff --git a/orttraining/orttraining/training_ops/cuda/tensor/split.cc b/orttraining/orttraining/training_ops/cuda/tensor/split.cc
index 4a30b785a6..37a18c603f 100644
--- a/orttraining/orttraining/training_ops/cuda/tensor/split.cc
+++ b/orttraining/orttraining/training_ops/cuda/tensor/split.cc
@@ -82,7 +82,8 @@ Status SplitTraining::ComputeInternal(OpKernelContext* ctx) const {
   axis_dimension_input_output_mapping_gpu.CopyToGpu();

   size_t element_size = input_tensor->DataType()->Size();
-  ORT_RETURN_IF_ERROR(SplitImpl(element_size,
+  ORT_RETURN_IF_ERROR(SplitImpl(Stream(),
+                                element_size,
                                 block_size_including_axis_dim,
                                 block_size_inside_axis_dim,
                                 split_sizes_gpu.GpuPtr(),
diff --git a/orttraining/orttraining/training_ops/cuda/tensor/view.cc b/orttraining/orttraining/training_ops/cuda/tensor/view.cc
index af6c140101..6d5d9da000 100644
--- a/orttraining/orttraining/training_ops/cuda/tensor/view.cc
+++ b/orttraining/orttraining/training_ops/cuda/tensor/view.cc
@@ -76,7 +76,7 @@ Status View::ComputeInternal(OpKernelContext* context) const {
       // View output is not sharing the underlaying buffer of input, copy instead
       const void* source = static_cast(X_data) + y_byte_offsets[i];
       void* target = Y->MutableDataRaw();
-      CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(target, source, Y->SizeInBytes(), cudaMemcpyDeviceToDevice));
+      CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(target, source, Y->SizeInBytes(), cudaMemcpyDeviceToDevice, Stream()));
     } else {
       Y->SetByteOffset(y_byte_offsets[i]);
     }
diff --git a/orttraining/orttraining/training_ops/rocm/math/softmax_grad.cc b/orttraining/orttraining/training_ops/rocm/math/softmax_grad.cc
index 766540b437..00039b7c42 100644
--- a/orttraining/orttraining/training_ops/rocm/math/softmax_grad.cc
+++ b/orttraining/orttraining/training_ops/rocm/math/softmax_grad.cc
@@ -13,6 +13,7 @@ namespace rocm {

 template <typename T, bool is_log_softmax>
 Status SoftMaxGradComputeHelper(
+    hipStream_t stream,
     const T* dY,
     const TensorShape& input_shape,
     const T* Y,
@@ -33,7 +34,7 @@ Status SoftMaxGradComputeHelper(

   if (D <= 1024 && D * sizeof(T) <= 4096) {
     dispatch_softmax_backward<HipT, HipT, AccumulationType_t<HipT>, is_log_softmax>(
-        dX_data, dY_data, Y_data, gsl::narrow_cast<int>(D), gsl::narrow_cast<int>(D), gsl::narrow_cast<int>(N));
+        stream, dX_data, dY_data, Y_data, gsl::narrow_cast<int>(D), gsl::narrow_cast<int>(D), gsl::narrow_cast<int>(N));
     return Status::OK();
   }

@@ -90,9 +91,9 @@ Status SoftmaxGrad::ComputeInternal(OpKernelContext* ctx) const {
   T* dX_data = dX->template MutableData<T>();

   if (log_softmax_) {
-    return SoftMaxGradComputeHelper<T, true>(dY_data, input_shape, Y_data, dX_data, MiopenHandle(), axis_);
+    return SoftMaxGradComputeHelper<T, true>(Stream(), dY_data, input_shape, Y_data, dX_data, MiopenHandle(), axis_);
   } else {
-    return SoftMaxGradComputeHelper<T, false>(dY_data, input_shape, Y_data, dX_data, MiopenHandle(), axis_);
+    return SoftMaxGradComputeHelper<T, false>(Stream(), dY_data, input_shape, Y_data, dX_data, MiopenHandle(), axis_);
   }
 }
diff --git a/orttraining/orttraining/training_ops/rocm/math/softmax_grad_impl.cu b/orttraining/orttraining/training_ops/rocm/math/softmax_grad_impl.cu
index c9c60c0706..2781435170 100644
--- a/orttraining/orttraining/training_ops/rocm/math/softmax_grad_impl.cu
+++ b/orttraining/orttraining/training_ops/rocm/math/softmax_grad_impl.cu
@@ -120,7 +120,7 @@ __global__ void softmax_warp_backward(output_t* gradInput, const input_t* grad,
 }

 template <typename input_t, typename output_t, typename acc_t, bool is_log_softmax>
-void dispatch_softmax_backward(output_t* grad_input, const input_t* grad, const input_t* output, int softmax_elements, int softmax_elements_stride, int batch_count) {
+void dispatch_softmax_backward(hipStream_t stream, output_t* grad_input, const input_t* grad, const input_t* output, int softmax_elements, int softmax_elements_stride, int batch_count) {
   if (softmax_elements == 0) {
     return;
   } else {
@@ -144,37 +144,37 @@ void dispatch_softmax_backward(output_t* grad_input, const input_t* grad, const
     // Launch code would be more elegant if C++ supported FOR CONSTEXPR
     switch (log2_elements) {
       case 0:  // 1
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, 0, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, stream, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements);
         break;
       case 1:  // 2
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, 0, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, stream, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements);
         break;
       case 2:  // 4
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, 0, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, stream, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements);
         break;
       case 3:  // 8
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, 0, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, stream, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements);
         break;
       case 4:  // 16
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, 0, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, stream, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements);
         break;
       case 5:  // 32
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, 0, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, stream, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements);
         break;
       case 6:  // 64
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, 0, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, stream, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements);
         break;
       case 7:  // 128
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, 0, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, stream, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements);
         break;
       case 8:  // 256
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, 0, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, stream, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements);
         break;
       case 9:  // 512
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, 0, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, stream, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements);
         break;
       case 10:  // 1024
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, 0, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, stream, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements);
        break;
       default:
         break;
@@ -183,8 +183,8 @@ void dispatch_softmax_backward(output_t* grad_input, const input_t* grad, const
 }

 #define SPECIALIZED_SOFTMAX_GRAD_IMPL(input_t, output_t, acc_t) \
-template void dispatch_softmax_backward<input_t, output_t, acc_t, false>(input_t * grad_input, const output_t* grad, const output_t* output, int softmax_elements, int softmax_elements_stride, int batch_count); \
-template void dispatch_softmax_backward<input_t, output_t, acc_t, true>(input_t * grad_input, const output_t* grad, const output_t* output, int softmax_elements, int softmax_elements_stride, int batch_count);
+template void dispatch_softmax_backward<input_t, output_t, acc_t, false>(hipStream_t stream, input_t * grad_input, const output_t* grad, const output_t* output, int softmax_elements, int softmax_elements_stride, int batch_count); \
+template void dispatch_softmax_backward<input_t, output_t, acc_t, true>(hipStream_t stream, input_t * grad_input, const output_t* grad, const output_t* output, int softmax_elements, int softmax_elements_stride, int batch_count);

 SPECIALIZED_SOFTMAX_GRAD_IMPL(float, float, float)
 SPECIALIZED_SOFTMAX_GRAD_IMPL(half, half, float)
diff --git a/orttraining/orttraining/training_ops/rocm/optimizer/adam.cc b/orttraining/orttraining/training_ops/rocm/optimizer/adam.cc
index 2d378bff5b..6c554ce76e 100644
--- a/orttraining/orttraining/training_ops/rocm/optimizer/adam.cc
+++ b/orttraining/orttraining/training_ops/rocm/optimizer/adam.cc
@@ -103,20 +103,20 @@ Status AdamOptimizer:
   if (do_update_tensor != nullptr) {
     const bool do_update = *(do_update_tensor->template Data<bool>());
     if (!do_update) {
-      ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(M1, NM1));
-      ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(M2, NM2));
+      ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(Stream(), M1, NM1));
+      ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(Stream(), M2, NM2));
       if (S_in != S_out) {
         *(S_out) = *(S_in);
       }
       if (NW != nullptr) {
-        ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(W, *NW));
+        ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(Stream(), W, *NW));
       }
       if (NG != nullptr) {
-        ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(G, *NG));
+        ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(Stream(), G, *NG));
       }
       if (W_MIXED_FP != nullptr && NW_MIXED_FP != nullptr) {
-        ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(*W_MIXED_FP, *NW_MIXED_FP));
+        ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(Stream(), *W_MIXED_FP, *NW_MIXED_FP));
       }

       return Status::OK();
@@ -124,6 +124,7 @@
   }

   AdamOptimizerImpl(
+      Stream(),
       reinterpret_cast(ETA.template Data()),
       *S_in,
       reinterpret_cast(W.template Data()),
diff --git a/orttraining/orttraining/training_ops/rocm/optimizer/adam.cu b/orttraining/orttraining/training_ops/rocm/optimizer/adam.cu
index cfda9ddbca..aa05bf86d0 100644
--- a/orttraining/orttraining/training_ops/rocm/optimizer/adam.cu
+++ b/orttraining/orttraining/training_ops/rocm/optimizer/adam.cu
@@ -139,6 +139,7 @@ __global__ void _AdamOptimizer_mode1(

 template <typename T1, typename T2, typename T3, typename T4, typename T_GRAD, typename T_GRAD_NORM, typename T_MIXED_PRECISION_FP>
 void AdamOptimizerImpl(
+    hipStream_t stream,
     const T1* eta,
     const T2 update_count,
     const T3* weights,
@@ -176,7 +177,7 @@ void AdamOptimizerImpl(
   // bias correction is applied on learning rate,
   // weight decay is applied after weight is updated.
   if (weight_decay_mode == 0) {
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(_AdamOptimizer_mode0), dim3(blocksPerGrid), dim3(GridDim::maxThreadsPerBlock), 0, 0,
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(_AdamOptimizer_mode0), dim3(blocksPerGrid), dim3(GridDim::maxThreadsPerBlock), 0, stream,
        eta,
        weights,
        grads,
@@ -199,7 +200,7 @@ void AdamOptimizerImpl(
        N);
   } else if (weight_decay_mode == 1) {
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(_AdamOptimizer_mode1), dim3(blocksPerGrid), dim3(GridDim::maxThreadsPerBlock), 0, 0,
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(_AdamOptimizer_mode1), dim3(blocksPerGrid), dim3(GridDim::maxThreadsPerBlock), 0, stream,
        eta,
        weights,
        grads,
@@ -229,6 +230,7 @@ void AdamOptimizerImpl(

 #define SPECIALIZED_AdamOptimizerImpl(T1, T2, T3, T4, T_GRAD, T_GRAD_NORM, T_MIXED_PRECISION_FP) \
   template void AdamOptimizerImpl( \
+      hipStream_t stream, \
       const T1* eta, \
       const T2 update_count, \
       const T3* weights, \
diff --git a/orttraining/orttraining/training_ops/rocm/optimizer/lamb.cc b/orttraining/orttraining/training_ops/rocm/optimizer/lamb.cc
index f45b5b1a85..0809149ec5 100644
--- a/orttraining/orttraining/training_ops/rocm/optimizer/lamb.cc
+++ b/orttraining/orttraining/training_ops/rocm/optimizer/lamb.cc
@@ -106,6 +106,7 @@ void check_inputs_and_outputs(

 template
 Status copy_inputs_to_outputs(
+    hipStream_t stream,
     OpKernelContext* ctx,
     const int non_grouped_input_count,
     const int non_grouped_output_count,
@@ -144,16 +145,16 @@ Status copy_inputs_to_outputs(
     w_mixed_precision_new->SetByteOffset(w_mixed_precision->ByteOffset());

     if (w_new) {
-      ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(w, *w_new));
+      ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(stream, w, *w_new));
     }
     if (g_new) {
-      ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(g, *g_new));
+      ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(stream, g, *g_new));
     }
-    ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(m1, m1_new));
-    ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(m2, m2_new));
+    ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(stream, m1, m1_new));
+    ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(stream, m2, m2_new));
     if (w_mixed_precision_new) {
-      ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(*w_mixed_precision, *w_mixed_precision_new));
+      ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(stream, *w_mixed_precision, *w_mixed_precision_new));
     }
   }

@@ -162,6 +163,7 @@ Status copy_inputs_to_outputs(

 template
 Status launch_lamb_compute_direction(
+    hipStream_t stream,
     const int64_t update_count,
     const int group_count,
     const HipT2* p_loss_scale,
@@ -210,6 +212,7 @@ Status launch_lamb_compute_direction(
         do_bias_correction ? onnxruntime::contrib::compute_bias_correction_coefficient(betas[i], update_count) : 1.f;

     LambComputeDirection(
+        stream,
         p_ws[i],
         p_gs[i],
         p_m1s[i],
@@ -257,6 +260,7 @@ Status launch_lamb_compute_direction(
     LambStage1 lamb_stage1;

     launch_multi_tensor_functor(
+        stream,
         2048 * 32,
         tensor_sizes_in_buckets[key],
         buckets[key],
@@ -287,7 +291,7 @@ Status launch_lamb_reduction(
   ORT_ENFORCE(group_count == static_cast<int>(p_ds.size()));

   constexpr int tensor_count_per_group = 4;
-
+  hipStream_t stream = kernel.Stream();
   // Bucketize tensor groups by the associated optimizer configuration.
   // If two tensor groups use different "alpha", they should be put into two distinct buckets.
   std::vector> buckets;
@@ -296,12 +300,14 @@ Status launch_lamb_reduction(
   for (int i = 0; i < group_count; ++i) {
     if (tensor_sizes[i] > max_tensor_size) {
       ORT_RETURN_IF_ERROR(reduce_square_sum(
+          stream,
           p_ws[i],
           p_w_norms[i],
           tensor_sizes[i],
           reduction_buffer,
           reduction_buffer_size));
       ORT_RETURN_IF_ERROR(reduce_square_sum(
+          stream,
           p_ds[i],
           p_d_norms[i],
           tensor_sizes[i],
@@ -332,6 +338,7 @@ Status launch_lamb_reduction(
     typedef LambMultiTensorReductionFunctor TReducer;
     TReducer reducer;
     launch_multi_tensor_functor(
+        stream,
         2048 * 32,
         tensor_sizes_in_buckets,
         buckets,
@@ -346,6 +353,7 @@ Status launch_lamb_reduction(

 template
 Status launch_lamb_update(
+    hipStream_t stream,
     const int group_count,
     const HipT1* eta,
     const float ratio_min,
@@ -378,6 +386,7 @@ Status launch_lamb_update(
   for (int i = 0; i < group_count; ++i) {
     if (tensor_sizes[i] > max_tensor_size) {
       LambUpdate(
+          stream,
           eta,
           ratio_min,
           ratio_max,
@@ -419,6 +428,7 @@ Status launch_lamb_update(
     LambStage2 lamb_stage2;

     launch_multi_tensor_functor(
+        stream,
         2048 * 32,
         tensor_sizes_in_bucket,
         buckets,
@@ -493,6 +503,7 @@ Status LambOptimizer::Compute
     auto update_signal = *update_signal_tensor->template Data<bool>();
     if (!update_signal) {
       return copy_inputs_to_outputs(
+          Stream(),
           ctx,
           non_grouped_input_count,
           non_grouped_output_count,
@@ -529,14 +540,14 @@ Status LambOptimizer::Compute
   // and T2=float.
   IAllocatorUniquePtr<T2> d_norm_buffer = GetScratchBuffer<T2>(group_count);
   HipT2* d_norm_data = reinterpret_cast<HipT2*>(d_norm_buffer.get());
-  HIP_RETURN_IF_ERROR(hipMemsetAsync(d_norm_data, 0, group_count * sizeof(T2)));
+  HIP_RETURN_IF_ERROR(hipMemsetAsync(d_norm_data, 0, group_count * sizeof(T2), Stream()));

   // Allocate buffer for reduction computation of weight tensor.
   // The i-th weight's norm is stored at the i-th element.
   // We reduce type T2 tensor to type T2 scalar. An example is that T2=float.
   IAllocatorUniquePtr<T2> w_norm_buffer = GetScratchBuffer<T2>(group_count);
   HipT2* w_norm_data = reinterpret_cast<HipT2*>(w_norm_buffer.get());
-  HIP_RETURN_IF_ERROR(hipMemsetAsync(w_norm_data, 0, group_count * sizeof(T2)));
+  HIP_RETURN_IF_ERROR(hipMemsetAsync(w_norm_data, 0, group_count * sizeof(T2), Stream()));

   // Find the max size of updated weight tensors.
   int max_tensor_size = 0;
@@ -642,6 +653,7 @@ Status LambOptimizer::Compute
   }

   ORT_RETURN_IF_ERROR(launch_lamb_compute_direction(
+      Stream(),
       step_data ? *step_data : 0,
       group_count,
       loss_scale_data,
@@ -665,6 +677,7 @@ Status LambOptimizer::Compute
       reduction_buffer_size));

   ORT_RETURN_IF_ERROR(launch_lamb_update(
+      Stream(),
       group_count,
       eta_data,
       ratio_min_,
diff --git a/orttraining/orttraining/training_ops/rocm/reduction/reduction_all.cc b/orttraining/orttraining/training_ops/rocm/reduction/reduction_all.cc
index c054b96816..6fae7135b7 100644
--- a/orttraining/orttraining/training_ops/rocm/reduction/reduction_all.cc
+++ b/orttraining/orttraining/training_ops/rocm/reduction/reduction_all.cc
@@ -44,7 +44,7 @@ Status ReduceAllL2::ComputeInternal(OpKernelContext* ctx) const {
   // Allocate output tensor.
   Tensor* output = ctx->Output(0, {});
   HipTOut* p_output = reinterpret_cast<HipTOut*>(output->template MutableData<TOut>());
-  HIP_RETURN_IF_ERROR(hipMemsetAsync(p_output, 0, sizeof(HipTOut)));
+  HIP_RETURN_IF_ERROR(hipMemsetAsync(p_output, 0, sizeof(HipTOut), Stream()));

   // bool deterministic = ctx->GetUseDeterministicCompute();
   bool deterministic = true;
@@ -55,11 +55,11 @@ Status ReduceAllL2::ComputeInternal(OpKernelContext* ctx) const {
     // Check if all values are finite and write true to deviceOutput.
     // Otherwise, false will be written.
     launch_multi_tensor_functor<1, TFunctor>(
-        2048 * 32, tensor_sizes, grouped_tensor_pointers, functor, p_output);
+        Stream(), 2048 * 32, tensor_sizes, grouped_tensor_pointers, functor, p_output);

     // *p_output is the squared sum of all elements.
     // Let's take a sqrt to get the actual L2-norm.
-    ScalarSqrt(p_output, p_output);
+    ScalarSqrt(Stream(), p_output, p_output);
   } else {
     // alternate path only for deterministic compute ..
     typedef AccumulationType_t<HipTOut> HipTAcc;
@@ -81,7 +81,7 @@ Status ReduceAllL2::ComputeInternal(OpKernelContext* ctx) const {

     // buffer for final output and square norms of each tensor
     auto results_buffer = GetScratchBuffer<HipTAcc>(1 + total_tensor_count);
-    HIP_RETURN_IF_ERROR(hipMemsetAsync(results_buffer.get(), 0, sizeof(HipTAcc) * (1 + total_tensor_count)));
+    HIP_RETURN_IF_ERROR(hipMemsetAsync(results_buffer.get(), 0, sizeof(HipTAcc) * (1 + total_tensor_count), Stream()));
     HipTAcc* p_global_sqnorm = results_buffer.get();
     HipTAcc* p_tensor_sqnorm = p_global_sqnorm + 1;

@@ -90,11 +90,11 @@ Status ReduceAllL2::ComputeInternal(OpKernelContext* ctx) const {
     for (int i = 0; i < total_tensor_count; ++i) {
       HipTIn* p_tensor_i = reinterpret_cast<HipTIn*>(grouped_tensor_pointers[i][0]);
       ORT_RETURN_IF_ERROR(reduce_square_sum(
-          p_tensor_i, p_tensor_sqnorm + i, tensor_sizes[i], reduction_buffer.get(), reduction_buffer_size));
+          Stream(), p_tensor_i, p_tensor_sqnorm + i, tensor_sizes[i], reduction_buffer.get(), reduction_buffer_size));
     }
     ORT_RETURN_IF_ERROR(reduce_sum(
-        p_tensor_sqnorm, p_global_sqnorm, total_tensor_count, reduction_buffer.get(), reduction_buffer_size));
-    ScalarSqrt(p_global_sqnorm, p_output);
+        Stream(), p_tensor_sqnorm, p_global_sqnorm, total_tensor_count, reduction_buffer.get(), reduction_buffer_size));
+    ScalarSqrt(Stream(), p_global_sqnorm, p_output);
   }

   return Status::OK();
diff --git a/orttraining/orttraining/training_ops/rocm/reduction/reduction_ops.cc b/orttraining/orttraining/training_ops/rocm/reduction/reduction_ops.cc
index 2bafe92209..c628efb013 100644
--- a/orttraining/orttraining/training_ops/rocm/reduction/reduction_ops.cc
+++ b/orttraining/orttraining/training_ops/rocm/reduction/reduction_ops.cc
@@ -47,7 +47,7 @@ Status ReduceKernel::ComputeImplEx(OpKernelContext* ctx, miope
   // empty axes and no-op
   if (axes.empty() && noop_with_empty_axes_) {
     auto* Y = ctx->Output(0, X->Shape());
-    HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->template MutableData<T>(), X->template Data<T>(), X->SizeInBytes(), hipMemcpyDeviceToDevice));
+    HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->template MutableData<T>(), X->template Data<T>(), X->SizeInBytes(), hipMemcpyDeviceToDevice, Stream()));
     return Status::OK();
   }

@@ -80,7 +80,7 @@ Status ReduceKernel::ComputeImplEx
     auto* Y = ctx->Output(0, X->Shape());
-    HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->template MutableData<T>(), X->template Data<T>(), X->SizeInBytes(), hipMemcpyDeviceToDevice));
+    HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->template MutableData<T>(), X->template Data<T>(), X->SizeInBytes(), hipMemcpyDeviceToDevice, Stream()));
     return Status::OK();
   }

@@ -107,14 +107,14 @@ Status ReduceKernel::ComputeImplEx
     if (Y->template MutableData() != X->template Data()) {
-      HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->template MutableData(), X->template Data(), input_count * sizeof(int32_t), hipMemcpyDeviceToDevice));
+      HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->template MutableData(), X->template Data(), input_count * sizeof(int32_t), hipMemcpyDeviceToDevice, Stream()));
     }
     return Status::OK();
   }

   // This reduction keep adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000.
   // Therefore zeroing out the memory is required
-  HIP_RETURN_IF_ERROR(hipMemsetAsync(Y->MutableDataRaw(), 0, Y->SizeInBytes()));
+  HIP_RETURN_IF_ERROR(hipMemsetAsync(Y->MutableDataRaw(), 0, Y->SizeInBytes(), Stream()));

   size_t indices_bytes = 0;
   size_t workspace_bytes = 0;
@@ -124,7 +124,7 @@ Status ReduceKernel::ComputeImplEx
     temp_X = GetScratchBuffer<float>(input_count);
-    Impl_Cast(reinterpret_cast(X->template Data()), temp_X.get(), X->Shape().Size());
+    Impl_Cast(Stream(), reinterpret_cast(X->template Data()), temp_X.get(), X->Shape().Size());

     ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, miopen_type_X, MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES));
     ORT_RETURN_IF_ERROR(input_tensor.Set(input_dims_miopen, miopen_type_X));
@@ -150,7 +150,7 @@ Status ReduceKernel::ComputeImplEx
-      Impl_Cast(temp_Y.get(), Y->template MutableData(), output_count);
+      Impl_Cast(Stream(), temp_Y.get(), Y->template MutableData(), output_count);

       return Status::OK();
     }
diff --git a/orttraining/orttraining/training_ops/rocm/tensor/gather_grad.cc b/orttraining/orttraining/training_ops/rocm/tensor/gather_grad.cc
index ef1fa2121e..d4ff6c37d8 100644
--- a/orttraining/orttraining/training_ops/rocm/tensor/gather_grad.cc
+++ b/orttraining/orttraining/training_ops/rocm/tensor/gather_grad.cc
@@ -93,7 +93,7 @@ Status GatherGrad::ComputeInternal(OpKernelContext* context) const {
   const Tensor* grad = context->Input<Tensor>(2);
   Tensor* output = context->Output(0, data_shape);
-  HIP_RETURN_IF_ERROR(hipMemsetAsync(output->MutableDataRaw(), 0, output->SizeInBytes()));
+  HIP_RETURN_IF_ERROR(hipMemsetAsync(output->MutableDataRaw(), 0, output->SizeInBytes(), Stream()));

   MLDataType T_type = grad->DataType();
   MLDataType Tin_type = indices->DataType();
diff --git a/orttraining/orttraining/training_ops/rocm/tensor/gather_grad_impl.cu b/orttraining/orttraining/training_ops/rocm/tensor/gather_grad_impl.cu
index 14599791b4..7263591b48 100644
--- a/orttraining/orttraining/training_ops/rocm/tensor/gather_grad_impl.cu
+++ b/orttraining/orttraining/training_ops/rocm/tensor/gather_grad_impl.cu
@@ -131,12 +131,13 @@ void GatherGradImpl(
     ) {
   // allocate intermediate buffers
   auto original_indices = rocm_kernel.template GetScratchBuffer<Tin>(num_indices);
+  hipStream_t stream = rocm_kernel.Stream();

   // initialize original_indices with [0, num_indices)
   {
     const auto blocks_per_grid = CeilDiv(num_indices, GridDim::maxThreadsPerBlock);
     hipcub::CountingInputIterator<Tin> counting_input(Tin{});
-    hipLaunchKernelGGL(_Iota, dim3(blocks_per_grid), dim3(GridDim::maxThreadsPerBlock), 0, 0,
+    hipLaunchKernelGGL(_Iota, dim3(blocks_per_grid), dim3(GridDim::maxThreadsPerBlock), 0, stream,
         counting_input, num_indices, original_indices.get());
   }
@@ -149,7 +150,7 @@ void GatherGradImpl(
       nullptr, sort_temp_storage_size_bytes,
       indices_data, indices_data_sorted.get(),
       original_indices.get(), original_indices_sorted.get(),
-      num_indices));
+      num_indices, 0, sizeof(Tin)*8, stream));

   auto sort_temp_storage = rocm_kernel.GetScratchBuffer(sort_temp_storage_size_bytes);

@@ -157,13 +158,13 @@ void GatherGradImpl(
       sort_temp_storage.get(), sort_temp_storage_size_bytes,
       indices_data, indices_data_sorted.get(),
       original_indices.get(), original_indices_sorted.get(),
-      num_indices));
+      num_indices, 0, sizeof(Tin)*8, stream));

   dim3 block(GPU_WARP_SIZE, 4);
   dim3 grid(CeilDiv(num_indices, 4), CeilDiv(stride, GridDim::maxElementsPerThread * GPU_WARP_SIZE));

   if (param_itrs == 1) {
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(_GatherAxis0GradImpl), dim3(grid), dim3(block), 0, 0,
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(_GatherAxis0GradImpl), dim3(grid), dim3(block), 0, stream,
         indices_data_sorted.get(),
         original_indices_sorted.get(),
         grad_data,
@@ -172,7 +173,7 @@ void GatherGradImpl(
         num_inputs,
         stride);
   } else {
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(_GatherGradImpl), dim3(grid), dim3(block), 0, 0,
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(_GatherGradImpl), dim3(grid), dim3(block), 0, stream,
         indices_data_sorted.get(),
         original_indices_sorted.get(),
         grad_data,
diff --git a/orttraining/orttraining/training_ops/rocm/tensor/gather_nd_grad_impl.cu b/orttraining/orttraining/training_ops/rocm/tensor/gather_nd_grad_impl.cu
index 10270bc7c4..8f924df170 100644
--- a/orttraining/orttraining/training_ops/rocm/tensor/gather_nd_grad_impl.cu
+++ b/orttraining/orttraining/training_ops/rocm/tensor/gather_nd_grad_impl.cu
@@ -24,18 +24,19 @@ __global__ void _GatherNDGradKernel(

 template <typename T>
 void GatherNDGradImpl(
+    hipStream_t stream,
     const size_t num_slices,
     const void* update_data,
     void* output_data,
     const size_t slice_size,
     const int64_t* input_slice_offsets_data) {
   const auto blocks_per_grid = CeilDiv(num_slices * slice_size, GridDim::maxThreadsPerBlock);
-  hipLaunchKernelGGL(HIP_KERNEL_NAME(_GatherNDGradKernel), dim3(blocks_per_grid), dim3(GridDim::maxThreadsPerBlock), 0, 0,
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(_GatherNDGradKernel), dim3(blocks_per_grid), dim3(GridDim::maxThreadsPerBlock), 0, stream,
       num_slices, static_cast<const T*>(update_data), static_cast<T*>(output_data), slice_size, input_slice_offsets_data);
 }

 #define SPECIALIZED_GRAD_IMPL(T) \
-  template void GatherNDGradImpl<T>(const size_t num_slices, const void* update_data, void* output_data, const size_t slice_size, const int64_t* input_slice_offsets_data)
+  template void GatherNDGradImpl<T>(hipStream_t stream, const size_t num_slices, const void* update_data, void* output_data, const size_t slice_size, const int64_t* input_slice_offsets_data)

 SPECIALIZED_GRAD_IMPL(float);
 SPECIALIZED_GRAD_IMPL(half);
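Note (not part of the patch): the hunks above all apply one mechanical change to the training kernels. Every asynchronous memset/memcpy and every kernel launch now receives the compute stream explicitly, either through a new leading cudaStream_t/hipStream_t parameter on the impl function or via the kernel's Stream() accessor at the call site, instead of implicitly running on the default stream. The sketch below is a minimal, self-contained CUDA illustration of that pattern under stated assumptions; the names ZeroThenScaleImpl and _ScaleKernel are hypothetical stand-ins, not ONNX Runtime symbols.

    #include <cuda_runtime.h>
    #include <cstdio>

    // Hypothetical elementwise kernel used only to illustrate the pattern.
    __global__ void _ScaleKernel(float* data, float alpha, size_t n) {
      size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
      if (i < n) data[i] *= alpha;
    }

    // Before the change, an impl like this would call cudaMemsetAsync(...) with no
    // stream argument and launch with <<<blocks, threads>>>, so the work landed on
    // the default stream. After the change, the stream is threaded through as the
    // first parameter and used for every enqueued operation.
    void ZeroThenScaleImpl(cudaStream_t stream, float* data, float alpha, size_t n) {
      // async memset is enqueued on the caller's stream, not the default stream
      cudaMemsetAsync(data, 0, n * sizeof(float), stream);
      const unsigned int threads = 256;
      const unsigned int blocks = static_cast<unsigned int>((n + threads - 1) / threads);
      // the same stream is passed as the fourth launch-configuration argument
      _ScaleKernel<<<blocks, threads, 0, stream>>>(data, alpha, n);
    }

    int main() {
      const size_t n = 1 << 20;
      float* d = nullptr;
      cudaStream_t stream;
      cudaStreamCreate(&stream);      // stands in for the provider's compute stream
      cudaMalloc(&d, n * sizeof(float));
      ZeroThenScaleImpl(stream, d, 2.0f, n);
      cudaStreamSynchronize(stream);  // all work above completes on this one stream
      cudaFree(d);
      cudaStreamDestroy(stream);
      printf("done\n");
      return 0;
    }

The point of the pattern is simply that everything an op enqueues lands on the same caller-chosen stream, so no launch or async copy silently escapes onto the legacy default stream.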