Support to allow user to specify compute stream per session (#3723)

* Support to allow user to specify compute stream per session

Create the computation CUDA stream explicitly rather than using the default legacy stream or per-thread default stream.

remove some redundant cudaStreamSynchronize calls

fix gpt2 model test failures

don't use default stream in nccl either.

add stream synchronization in OnRunEnd()

using cub::DeviceScan::InclusiveSum which can be called with stream specified.

fix topK failure due to latest rebase

fix tensorrt

support user specified stream

add user_stream support in tensorrt EP

use same stream for both TensorRT and CUDA EP.

fix ScatterND

specify stream for adasum and p2p kernels.

fix loop

fix CApiTest.custom_op_handler

fix CApiTest.varied_input_custom_op_handler

change for cudaMemcpyFromSymbol

improve provider options for user specified compute stream

* add changes for ROCM EP

* fix GatherGrad UT for ROCM EP

* clean code and fix NonMaxSuppression

* use default stream for ROCM now

* fix CApiTest.custom_op_handler:OrtFormatCustomOpTests.ConvertOnnxModelToOrt

* fix tensorrt ut: CApiTest.io_binding_cuda

Co-authored-by: Weixing Zhang <wezhan@microsoft.com>
This commit is contained in:
Weixing Zhang 2021-02-05 15:48:18 -08:00 committed by GitHub
parent 973c3917a6
commit 299ace0759
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
320 changed files with 1876 additions and 1109 deletions

View file

@ -1198,9 +1198,9 @@ if (onnxruntime_USE_CUDA)
endif()
endif()
endif()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --default-stream legacy")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
if (NOT WIN32)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --expt-relaxed-constexpr --compiler-options -fPIC")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --compiler-options -fPIC")
endif()
# Options passed to cudafe
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcudafe \"--diag_suppress=bad_friend_decl\"")

View file

@ -165,6 +165,9 @@ class IExecutionProvider {
*/
virtual common::Status OnSessionInitializationEnd() { return Status::OK(); }
virtual common::Status SetComputeStream(void*) { return Status::OK(); }
virtual void* GetComputeStream() const { return nullptr; }
void InsertAllocator(AllocatorPtr allocator);
void ReplaceAllocator(AllocatorPtr allocator);
// TODO: temparary sulotion, need to unify the interface in EP and AllocatorManager

View file

@ -266,8 +266,19 @@ typedef struct OrtCUDAProviderOptions {
size_t cuda_mem_limit; // default cuda memory limitation to maximum finite value of size_t.
int arena_extend_strategy; // default area extend strategy to KNextPowerOfTwo.
int do_copy_in_default_stream;
int has_user_compute_stream;
void* user_compute_stream;
} OrtCUDAProviderOptions;
/// <summary>
/// Options for the TensorRT provider that are passed to SessionOptionsAppendExecutionProvider_TensorRT
/// </summary>
typedef struct OrtTensorRTProviderOptions {
int device_id;
int has_user_compute_stream;
void* user_compute_stream;
} OrtTensorRTProviderOptions;
/// <summary>
/// Options for the OpenVINO provider that are passed to SessionOptionsAppendExecutionProvider_OpenVINO
/// </summary>
@ -1146,6 +1157,12 @@ struct OrtApi {
*/
ORT_API2_STATUS(ModelMetadataGetGraphDescription, _In_ const OrtModelMetadata* model_metadata,
_Inout_ OrtAllocator* allocator, _Outptr_ char** value);
/**
* Append TensorRT execution provider to the session options
* If TensorRT is not available (due to a non TensorRT enabled build), this function will return failure.
*/
ORT_API2_STATUS(SessionOptionsAppendExecutionProvider_TensorRT,
_In_ OrtSessionOptions* options, _In_ const OrtTensorRTProviderOptions* tensorrt_options);
};
/*

View file

@ -326,6 +326,7 @@ struct SessionOptions : Base<OrtSessionOptions> {
SessionOptions& AppendExecutionProvider_CUDA(const OrtCUDAProviderOptions& provider_options);
SessionOptions& AppendExecutionProvider_OpenVINO(const OrtOpenVINOProviderOptions& provider_options);
SessionOptions& AppendExecutionProvider_TensorRT(const OrtTensorRTProviderOptions& provider_options);
};
struct ModelMetadata : Base<OrtModelMetadata> {

View file

@ -490,6 +490,11 @@ inline SessionOptions& SessionOptions::AppendExecutionProvider_CUDA(const OrtCUD
return *this;
}
inline SessionOptions& SessionOptions::AppendExecutionProvider_TensorRT(const OrtTensorRTProviderOptions& provider_options) {
ThrowOnError(GetApi().SessionOptionsAppendExecutionProvider_TensorRT(p_, &provider_options));
return *this;
}
inline SessionOptions& SessionOptions::AppendExecutionProvider_OpenVINO(const OrtOpenVINOProviderOptions& provider_options) {
ThrowOnError(GetApi().SessionOptionsAppendExecutionProvider_OpenVINO(p_, &provider_options));
return *this;

View file

@ -29,6 +29,7 @@ namespace cuda {
ORT_RETURN_IF_ERROR(UnaryElementwise::Prepare(context, &p)); \
Ctx##x func_ctx = MakeFuncCtx(); \
Impl_##x<typename ToCudaType<T>::MappedType>( \
Stream(), \
reinterpret_cast<const typename ToCudaType<T>::MappedType*>(p.input_tensor->template Data<T>()), \
reinterpret_cast<typename ToCudaType<T>::MappedType*>(p.output_tensor->template MutableData<T>()), \
&func_ctx, p.output_tensor->Shape().Size()); \

View file

@ -45,14 +45,15 @@ struct OP_Gelu : public CtxGelu {
#define UNARY_ACTIVATION_IMPL(name) \
UNARY_ACTIVATION_IMPL_DECLARATION(name) { \
UnaryElementWiseImpl(input_data, \
UnaryElementWiseImpl(stream, \
input_data, \
output_data, \
*reinterpret_cast<const OP_##name<T>*>(func_ctx), \
count); \
}
#define SPECIALIZED_UNARY_ACTIVATION_IMPL(name, T) \
template void Impl_##name<T>(const T* input_data, T* output_data, const Ctx##name* func_ctx, size_t count);
template void Impl_##name<T>(cudaStream_t stream, const T* input_data, T* output_data, const Ctx##name* func_ctx, size_t count);
#define SPECIALIZED_UNARY_ACTIVATIONL_HFD(name) \
SPECIALIZED_UNARY_ACTIVATION_IMPL(name, half) \

View file

@ -22,6 +22,7 @@ typedef onnxruntime::cuda::CtxNull CtxGelu;
#define UNARY_ACTIVATION_IMPL_DECLARATION(name) \
template <typename T> \
void Impl_##name( \
cudaStream_t stream, \
const T* input_data, \
T* output_data, \
const Ctx##name* func_ctx, \

View file

@ -88,6 +88,7 @@ Status Attention<T>::ComputeInternal(OpKernelContext* context) const {
auto temp_buffer = GetScratchBuffer<void>(workSpaceSize);
if (!LaunchAttentionKernel(
device_prop,
Stream(),
reinterpret_cast<const CudaT*>(gemm_buffer.get()),
nullptr == mask_index ? nullptr : mask_index->template Data<int>(),
nullptr == mask_index ? nullptr : &(mask_index->Shape().GetDims()),

View file

@ -148,6 +148,7 @@ bool QkvToContext(
bool LaunchAttentionKernel(
const cudaDeviceProp& prop,
cudaStream_t stream,
const void* input,
const int* mask_index,
const std::vector<int64_t>* mask_index_dims,
@ -163,9 +164,6 @@ bool LaunchAttentionKernel(
int past_sequence_length,
const void* past,
void* present) {
// use default stream
const cudaStream_t stream = nullptr;
if (element_size == 2) {
return QkvToContext(prop, cublas, stream,
batch_size, sequence_length, num_heads, head_size, element_size,

View file

@ -20,6 +20,7 @@ size_t GetAttentionWorkspaceSize(
bool LaunchAttentionKernel(
const cudaDeviceProp& prop, // Device Properties
cudaStream_t stream, // cuda stream
const void* input, // Input tensor
const int* mask_index, // Attention mask raw data or index (end position of each sequence, or end positions and start positions). NULL means no mask.
const std::vector<int64_t>* mask_index_dims, // Mask index shape

View file

@ -61,6 +61,7 @@ Status EmbedLayerNorm<T>::ComputeInternal(OpKernelContext* context) const {
size_t element_size = sizeof(T);
if (!LaunchEmbedLayerNormKernel(
Stream(),
output->template MutableData<T>(),
mask_index->template MutableData<int32_t>(),
input_ids->template Data<int32_t>(),

View file

@ -173,6 +173,7 @@ bool EmbedSkipLayerNorm(
}
bool LaunchEmbedLayerNormKernel(
cudaStream_t stream,
void* output,
void* mask_index,
const int* input_ids,
@ -188,10 +189,8 @@ bool LaunchEmbedLayerNormKernel(
int batch_size,
int sequence_length,
const size_t element_size) {
const cudaStream_t stream = nullptr; // default stream
if (nullptr == input_mask) {
if (!CUDA_CALL(cudaMemsetAsync(mask_index, 0, sizeof(int) * batch_size)))
if (!CUDA_CALL(cudaMemsetAsync(mask_index, 0, sizeof(int) * batch_size, stream)))
return false;
} else if (!ComputeMaskIndex(stream, sequence_length, batch_size, input_mask, static_cast<int*>(mask_index))) {
return false;

View file

@ -6,7 +6,8 @@ namespace onnxruntime {
namespace contrib {
namespace cuda {
bool LaunchEmbedLayerNormKernel(void* output, // output tensor
bool LaunchEmbedLayerNormKernel(cudaStream_t stream,
void* output, // output tensor
void* mask_index, // output mask index
const int* input_ids, // input word IDs
const int* segment_ids, // input segment IDs

View file

@ -47,7 +47,7 @@ Status FastGelu<T>::ComputeInternal(OpKernelContext* context) const {
int64_t bias_length = (nullptr == bias) ? 0 : bias->Shape().Size();
typedef typename ToCudaType<T>::MappedType CudaT;
if (!LaunchFastGeluKernel<CudaT>(GetDeviceProp(),
nullptr,
Stream(),
static_cast<int>(input_length),
static_cast<int>(bias_length),
reinterpret_cast<const CudaT*>(input->template Data<T>()),

View file

@ -111,6 +111,7 @@ Status LongformerAttention<T>::ComputeInternal(OpKernelContext* context) const {
auto workspace_buffer = GetScratchBuffer<void>(workSpaceSize);
if (!LaunchLongformerAttentionKernel(
device_prop,
Stream(),
reinterpret_cast<const CudaT*>(gemm_buffer.get()),
reinterpret_cast<const CudaT*>(mask->template Data<T>()),
reinterpret_cast<const CudaT*>(global_gemm_buffer.get()),

View file

@ -814,6 +814,7 @@ bool LongformerQkvToContext(
bool LaunchLongformerAttentionKernel(
const cudaDeviceProp& prop,
cudaStream_t stream,
const void* input,
const void* attention_mask,
const void* global_input,
@ -828,9 +829,6 @@ bool LaunchLongformerAttentionKernel(
void* workspace,
cublasHandle_t& cublas,
const size_t element_size) {
// use default stream
const cudaStream_t stream = nullptr;
if (element_size == 2) {
return LongformerQkvToContext(prop, cublas, stream,
batch_size, sequence_length, num_heads, head_size, window, element_size,

View file

@ -18,6 +18,7 @@ size_t GetLongformerAttentionWorkspaceSize(
bool LaunchLongformerAttentionKernel(
const cudaDeviceProp& device_prop, // Device Properties
cudaStream_t stream, // CUDA stream
const void* input, // Input tensor
const void* attention_mask, // Attention mask with shape (B, S)
const void* global_input, // Global attention input, or nullptr when max_num_global == 0.

View file

@ -93,6 +93,7 @@ Status SkipLayerNorm<T>::ComputeInternal(OpKernelContext* ctx) const {
size_t element_size = sizeof(T);
if (!LaunchSkipLayerNormKernel(
Stream(),
output->template MutableData<T>(),
input->template Data<T>(),
skip->template Data<T>(),

View file

@ -100,6 +100,7 @@ bool ComputeSkipLayerNorm(
}
bool LaunchSkipLayerNormKernel(
cudaStream_t stream,
void* output,
const void* input,
const void* skip,
@ -110,9 +111,6 @@ bool LaunchSkipLayerNormKernel(
int hidden_size,
int element_count,
size_t element_size) {
// use default stream
const cudaStream_t stream = nullptr;
if (element_size == 2) {
return ComputeSkipLayerNorm(
stream,

View file

@ -8,6 +8,7 @@ namespace contrib {
namespace cuda {
bool LaunchSkipLayerNormKernel(
cudaStream_t stream,
void* output, // output tensor
const void* input, // input tensor
const void* skip, // skip tensor

View file

@ -90,7 +90,7 @@ class FusedConv : public onnxruntime::cuda::Conv<T> {
Base::s_.y_data, beta, Base::s_.y_tensor, Base::s_.y_data));
}
if (Base::s_.post_slicing_required) {
onnxruntime::cuda::SliceOutUnwantedOutputSection(Base::s_.y_data, Base::s_.y_dims_with_adjusted_pads, Base::s_.Y->MutableDataRaw(),
onnxruntime::cuda::SliceOutUnwantedOutputSection(this->Stream(), Base::s_.y_data, Base::s_.y_dims_with_adjusted_pads, Base::s_.Y->MutableDataRaw(),
Base::s_.y_dims, Base::s_.slice_starts, Base::s_.slice_ends, Base::s_.slice_axes, Base::s_.element_size);
}
return Status::OK();

View file

@ -35,22 +35,24 @@ ONNX_OPERATOR_KERNEL_EX(
namespace inverse_internal {
template <typename T>
Status ComputeMatrixOffsets(T* workspace_data, size_t num_batches, size_t rows, IAllocatorUniquePtr<T*>& matrix_ptrs) {
Status ComputeMatrixOffsets(cudaStream_t stream, T* workspace_data, size_t num_batches, size_t rows, IAllocatorUniquePtr<T*>& matrix_ptrs) {
std::vector<T*> cuda_ptrs;
const size_t matrix_size = rows * rows;
for (size_t i = 0; i < num_batches; ++i) {
cuda_ptrs.push_back(workspace_data);
workspace_data += matrix_size;
}
CUDA_RETURN_IF_ERROR(cudaMemcpy(matrix_ptrs.get(), cuda_ptrs.data(), sizeof(T*) * num_batches,
cudaMemcpyHostToDevice));
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(matrix_ptrs.get(), cuda_ptrs.data(), sizeof(T*) * num_batches,
cudaMemcpyHostToDevice, stream));
return Status::OK();
}
Status CheckForSingularity(const IAllocatorUniquePtr<int>& info, const std::unique_ptr<int[]>& info_cpu, size_t num_batches) {
Status CheckForSingularity(cudaStream_t stream, const IAllocatorUniquePtr<int>& info, const std::unique_ptr<int[]>& info_cpu, size_t num_batches) {
// Let's check if any of the info values is non-zero
CUDA_RETURN_IF_ERROR(cudaMemcpy(info_cpu.get(), info.get(), sizeof(int) * num_batches,
cudaMemcpyDeviceToHost));
// cudaMemcpyAsync from device memory to pageable host memory will return only once the copy has completed.
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(info_cpu.get(), info.get(), sizeof(int) * num_batches,
cudaMemcpyDeviceToHost, stream));
for (size_t i = 0; i < num_batches; ++i) {
if (info_cpu[i] != 0) {
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Matrix is singular at batch:", i);
@ -63,7 +65,7 @@ Status CheckForSingularity(const IAllocatorUniquePtr<int>& info, const std::uniq
template <typename T>
struct Inverse::ComputeImpl {
Status operator()(Inverse::CublasHandle cublas_h, const Inverse* inst, const Tensor& input, Tensor& output,
Status operator()(cudaStream_t stream, Inverse::CublasHandle cublas_h, const Inverse* inst, const Tensor& input, Tensor& output,
const IAllocatorUniquePtr<int>& info, const IAllocatorUniquePtr<int>& pivots,
size_t num_batches, size_t rows) const {
using namespace onnxruntime::cuda;
@ -79,52 +81,52 @@ struct Inverse::ComputeImpl {
IAllocatorUniquePtr<float> input_workspace = inst->GetScratchBuffer<float>(input_count);
if (std::is_same<T, MLFloat16>::value) {
// Convert from MLFloat16(half) to float
Impl_Cast<CudaT, float>(reinterpret_cast<const CudaT*>(input.Data<MLFloat16>()), input_workspace.get(), input_count);
Impl_Cast<CudaT, float>(stream, reinterpret_cast<const CudaT*>(input.Data<MLFloat16>()), input_workspace.get(), input_count);
} else {
CUDA_RETURN_IF_ERROR(cudaMemcpy(input_workspace.get(), input.Data<float>(), sizeof(float) * input_count,
cudaMemcpyDeviceToDevice));
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(input_workspace.get(), input.Data<float>(), sizeof(float) * input_count,
cudaMemcpyDeviceToDevice, stream));
}
IAllocatorUniquePtr<float*> matrix_ptrs = inst->GetScratchBuffer<float*>(n_batches);
ORT_RETURN_IF_ERROR(ComputeMatrixOffsets<float>(input_workspace.get(), num_batches, rows, matrix_ptrs));
ORT_RETURN_IF_ERROR(ComputeMatrixOffsets<float>(stream, input_workspace.get(), num_batches, rows, matrix_ptrs));
// Do LU factorization
CUBLAS_RETURN_IF_ERROR(cublasSgetrfBatched(cublas_h, dim, matrix_ptrs.get(), dim, pivots.get(), info.get(), n_batches));
ORT_RETURN_IF_ERROR(CheckForSingularity(info, info_cpu, num_batches));
ORT_RETURN_IF_ERROR(CheckForSingularity(stream, info, info_cpu, num_batches));
// Need to compute ptrs for output buffers
// Output for MLFloat
IAllocatorUniquePtr<float*> output_ptrs = inst->GetScratchBuffer<float*>(n_batches);
if (std::is_same<T, MLFloat16>::value) {
IAllocatorUniquePtr<float> ml_float_output = inst->GetScratchBuffer<float>(input_count);
ORT_RETURN_IF_ERROR(ComputeMatrixOffsets<float>(ml_float_output.get(), num_batches, rows, output_ptrs));
ORT_RETURN_IF_ERROR(ComputeMatrixOffsets<float>(stream, ml_float_output.get(), num_batches, rows, output_ptrs));
// Do the inverse
CUBLAS_RETURN_IF_ERROR(cublasSgetriBatched(cublas_h, dim, matrix_ptrs.get(), dim, pivots.get(), output_ptrs.get(), dim, info.get(), n_batches));
ORT_RETURN_IF_ERROR(CheckForSingularity(info, info_cpu, num_batches));
ORT_RETURN_IF_ERROR(CheckForSingularity(stream, info, info_cpu, num_batches));
// Copy the result to output with casting
Impl_Cast<float, CudaT>(ml_float_output.get(), reinterpret_cast<CudaT*>(output.MutableData<MLFloat16>()), input_count);
Impl_Cast<float, CudaT>(stream, ml_float_output.get(), reinterpret_cast<CudaT*>(output.MutableData<MLFloat16>()), input_count);
// We are done here
} else {
ORT_RETURN_IF_ERROR(ComputeMatrixOffsets<float>(output.MutableData<float>(), num_batches, rows, output_ptrs));
ORT_RETURN_IF_ERROR(ComputeMatrixOffsets<float>(stream, output.MutableData<float>(), num_batches, rows, output_ptrs));
// Do the inverse
CUBLAS_RETURN_IF_ERROR(cublasSgetriBatched(cublas_h, dim, matrix_ptrs.get(), dim, pivots.get(), output_ptrs.get(), dim, info.get(), n_batches));
ORT_RETURN_IF_ERROR(CheckForSingularity(info, info_cpu, num_batches));
ORT_RETURN_IF_ERROR(CheckForSingularity(stream, info, info_cpu, num_batches));
// We are done here
}
} else if (std::is_same<T, double>::value) {
IAllocatorUniquePtr<double> input_workspace = inst->GetScratchBuffer<double>(static_cast<int>(input_count));
CUDA_RETURN_IF_ERROR(cudaMemcpy(input_workspace.get(), input.Data<double>(), sizeof(double) * input_count,
cudaMemcpyDeviceToDevice));
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(input_workspace.get(), input.Data<double>(), sizeof(double) * input_count,
cudaMemcpyDeviceToDevice, stream));
IAllocatorUniquePtr<double*> matrix_ptrs = inst->GetScratchBuffer<double*>(n_batches);
ORT_RETURN_IF_ERROR(ComputeMatrixOffsets<double>(input_workspace.get(), num_batches, rows, matrix_ptrs));
ORT_RETURN_IF_ERROR(ComputeMatrixOffsets<double>(stream, input_workspace.get(), num_batches, rows, matrix_ptrs));
// Do LU factorization
CUBLAS_RETURN_IF_ERROR(cublasDgetrfBatched(cublas_h, dim, matrix_ptrs.get(), dim, pivots.get(), info.get(), n_batches));
ORT_RETURN_IF_ERROR(CheckForSingularity(info, info_cpu, num_batches));
ORT_RETURN_IF_ERROR(CheckForSingularity(stream, info, info_cpu, num_batches));
// Need to compute ptrs for output buffers
IAllocatorUniquePtr<double*> output_ptrs = inst->GetScratchBuffer<double*>(n_batches);
ORT_RETURN_IF_ERROR(ComputeMatrixOffsets<double>(output.MutableData<double>(), num_batches, rows, output_ptrs));
ORT_RETURN_IF_ERROR(ComputeMatrixOffsets<double>(stream, output.MutableData<double>(), num_batches, rows, output_ptrs));
CUBLAS_RETURN_IF_ERROR(cublasDgetriBatched(cublas_h, dim, matrix_ptrs.get(), dim, pivots.get(), output_ptrs.get(), dim, info.get(), n_batches));
ORT_RETURN_IF_ERROR(CheckForSingularity(info, info_cpu, num_batches));
ORT_RETURN_IF_ERROR(CheckForSingularity(stream, info, info_cpu, num_batches));
// We are done here
} else {
ORT_THROW("Type is not supported");
@ -148,11 +150,11 @@ Status Inverse::ComputeInternal(OpKernelContext* ctx) const {
}
IAllocatorUniquePtr<int> info = GetScratchBuffer<int>(num_batches);
CUDA_RETURN_IF_ERROR(cudaMemsetAsync(info.get(), 0, num_batches));
CUDA_RETURN_IF_ERROR(cudaMemsetAsync(info.get(), 0, num_batches, Stream()));
IAllocatorUniquePtr<int> pivots = GetScratchBuffer<int>(rows * num_batches);
utils::MLTypeCallDispatcherRet<Status, ComputeImpl, float, double, MLFloat16> t_disp(input->GetElementType());
return t_disp.Invoke(Base::CublasHandle(), this, *input, *output, info, pivots, num_batches, rows);
return t_disp.Invoke(Stream(), Base::CublasHandle(), this, *input, *output, info, pivots, num_batches, rows);
}
} // namespace cuda

View file

@ -98,7 +98,7 @@ Status LayerNorm<T, U, simplified>::ComputeInternal(OpKernelContext* ctx) const
inv_var_data = reinterpret_cast<CudaU*>(var->template MutableData<U>());
}
HostApplyLayerNorm<CudaT, CudaU, simplified>(GetDeviceProp(), Y_data, mean_data, inv_var_data, X_data, n1, n2, epsilon_, scale_data, bias_data);
HostApplyLayerNorm<CudaT, CudaU, simplified>(GetDeviceProp(), Stream(), Y_data, mean_data, inv_var_data, X_data, n1, n2, epsilon_, scale_data, bias_data);
return Status::OK();
}

View file

@ -350,6 +350,7 @@ __global__ void cuApplyLayerNorm(
template <typename T, typename U, bool simplified>
void HostApplyLayerNorm(
const cudaDeviceProp& prop,
cudaStream_t stream,
T* output,
U* mean,
U* invvar,
@ -367,7 +368,7 @@ void HostApplyLayerNorm(
const dim3 blocks(1, std::min<unsigned int>(n1, maxGridY), 1);
int nshared =
threads.y > 1 ? threads.y * sizeof(U) + (threads.y / 2) * sizeof(U) : 0;
cuApplyLayerNorm<T, U, simplified><<<blocks, threads, nshared, 0>>>(
cuApplyLayerNorm<T, U, simplified><<<blocks, threads, nshared, stream>>>(
output,
mean,
invvar,
@ -378,7 +379,7 @@ void HostApplyLayerNorm(
}
#define LAYERNORM_LINEAR_IMPL(T, U, simplified) \
template void HostApplyLayerNorm<T, U, simplified>(const cudaDeviceProp& prop, T* output, U* mean, U* invvar, const T* input, int n1, int n2, \
template void HostApplyLayerNorm<T, U, simplified>(const cudaDeviceProp& prop, cudaStream_t stream, T* output, U* mean, U* invvar, const T* input, int n1, int n2, \
double epsilon, const T* gamma, const T* beta);
LAYERNORM_LINEAR_IMPL(float, float, true)

View file

@ -32,6 +32,7 @@ namespace cuda {
template <typename T, typename U, bool simplified>
void HostApplyLayerNorm(
const cudaDeviceProp& prop,
cudaStream_t stream,
T* output,
U* mean,
U* invvar,

View file

@ -15,6 +15,7 @@ namespace cuda {
template <typename T>
void DispatchBiasSoftmaxForwardImpl(
cudaStream_t stream,
Tensor* output_tensor,
const Tensor* input_tensor,
const Tensor* input_bias_tensor,
@ -25,6 +26,7 @@ void DispatchBiasSoftmaxForwardImpl(
template <typename T>
void DispatchBiasSoftMaxForwardViaDnnLibraryImpl(
cudaStream_t stream,
cudnnHandle_t cudaDnnHandle,
int element_count,
int batch_count,
@ -64,12 +66,12 @@ Status BiasSoftmax::ComputeInternal(OpKernelContext* ctx) const {
// expect thread blocks can fill SM at high occupancy without overflowing registers
utils::MLTypeCallDispatcher<DispatchBiasSoftmaxForward, double, float, MLFloat16>
t_disp(X->GetElementType());
t_disp.Invoke(Y, X, B, D, N, D, broadcast_size);
t_disp.Invoke(Stream(), Y, X, B, D, N, D, broadcast_size);
} else {
// need to fallback to add kernel + CUDA DNN library softmax call :/
utils::MLTypeCallDispatcher<DispatchBiasSoftMaxForwardViaDnnLibrary, double, float, MLFloat16>
t_disp(X->GetElementType());
t_disp.Invoke(CudnnHandle(), D, N, broadcast_axis, softmax_axis, X_shape, X, B_shape, B, Y);
t_disp.Invoke(Stream(), CudnnHandle(), D, N, broadcast_axis, softmax_axis, X_shape, X, B_shape, B, Y);
}
return Status::OK();
@ -77,6 +79,7 @@ Status BiasSoftmax::ComputeInternal(OpKernelContext* ctx) const {
template <typename T>
void DispatchBiasSoftmaxForward<T>::operator()(
cudaStream_t stream,
Tensor* output,
const Tensor* input,
const Tensor* input_bias,
@ -85,6 +88,7 @@ void DispatchBiasSoftmaxForward<T>::operator()(
int batch_stride,
int bias_broadcast_size_per_batch) {
DispatchBiasSoftmaxForwardImpl<T>(
stream,
output,
input,
input_bias,
@ -96,6 +100,7 @@ void DispatchBiasSoftmaxForward<T>::operator()(
template <typename T>
void DispatchBiasSoftMaxForwardViaDnnLibrary<T>::operator()(
cudaStream_t stream,
cudnnHandle_t cudaDnnHandle,
int element_count,
int batch_count,
@ -107,6 +112,7 @@ void DispatchBiasSoftMaxForwardViaDnnLibrary<T>::operator()(
const onnxruntime::Tensor* B,
onnxruntime::Tensor* Y) {
DispatchBiasSoftMaxForwardViaDnnLibraryImpl<T>(
stream,
cudaDnnHandle,
element_count,
batch_count,

View file

@ -13,6 +13,7 @@ namespace cuda {
template <typename T>
struct DispatchBiasSoftmaxForward {
void operator()(
cudaStream_t stream,
Tensor* output,
const Tensor* input,
const Tensor* input_bias,
@ -25,6 +26,7 @@ struct DispatchBiasSoftmaxForward {
template <typename T>
struct DispatchBiasSoftMaxForwardViaDnnLibrary {
void operator()(
cudaStream_t stream,
cudnnHandle_t cudaDnnHandle,
int element_count,
int batch_count,

View file

@ -127,6 +127,7 @@ __global__ void BiasSoftmaxWarpForward(
template <typename T>
void DispatchBiasSoftmaxForwardImpl(
cudaStream_t stream,
Tensor* output_tensor,
const Tensor* input_tensor,
const Tensor* input_bias_tensor,
@ -167,47 +168,47 @@ void DispatchBiasSoftmaxForwardImpl(
switch (log2_elements) {
case 0: // 1
BiasSoftmaxWarpForward<input_t, output_t, acc_t, 0>
<<<blocks, threads, 0>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
<<<blocks, threads, 0, stream>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
break;
case 1: // 2
BiasSoftmaxWarpForward<input_t, output_t, acc_t, 1>
<<<blocks, threads, 0>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
<<<blocks, threads, 0, stream>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
break;
case 2: // 4
BiasSoftmaxWarpForward<input_t, output_t, acc_t, 2>
<<<blocks, threads, 0>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
<<<blocks, threads, 0, stream>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
break;
case 3: // 8
BiasSoftmaxWarpForward<input_t, output_t, acc_t, 3>
<<<blocks, threads, 0>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
<<<blocks, threads, 0, stream>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
break;
case 4: // 16
BiasSoftmaxWarpForward<input_t, output_t, acc_t, 4>
<<<blocks, threads, 0>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
<<<blocks, threads, 0, stream>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
break;
case 5: // 32
BiasSoftmaxWarpForward<input_t, output_t, acc_t, 5>
<<<blocks, threads, 0>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
<<<blocks, threads, 0, stream>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
break;
case 6: // 64
BiasSoftmaxWarpForward<input_t, output_t, acc_t, 6>
<<<blocks, threads, 0>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
<<<blocks, threads, 0, stream>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
break;
case 7: // 128
BiasSoftmaxWarpForward<input_t, output_t, acc_t, 7>
<<<blocks, threads, 0>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
<<<blocks, threads, 0, stream>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
break;
case 8: // 256
BiasSoftmaxWarpForward<input_t, output_t, acc_t, 8>
<<<blocks, threads, 0>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
<<<blocks, threads, 0, stream>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
break;
case 9: // 512
BiasSoftmaxWarpForward<input_t, output_t, acc_t, 9>
<<<blocks, threads, 0>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
<<<blocks, threads, 0, stream>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
break;
case 10: // 1024
BiasSoftmaxWarpForward<input_t, output_t, acc_t, 10>
<<<blocks, threads, 0>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
<<<blocks, threads, 0, stream>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
break;
default:
break;
@ -216,6 +217,7 @@ void DispatchBiasSoftmaxForwardImpl(
#define SPECIALIZED_BIAS_SOFTMAX_IMPL(T) \
template void DispatchBiasSoftmaxForwardImpl<T>( \
cudaStream_t stream, \
Tensor * output_tensor, \
const Tensor* input_tensor, \
const Tensor* input_bias_tensor, \
@ -232,6 +234,7 @@ SPECIALIZED_BIAS_SOFTMAX_IMPL(MLFloat16)
// note: This is an unhappy path! There is no performance benefit for the fusion.
template <typename T>
void DispatchBiasSoftMaxForwardViaDnnLibraryImpl(
cudaStream_t stream,
cudnnHandle_t cudaDnnHandle,
int element_count,
int batch_count,
@ -278,6 +281,7 @@ void DispatchBiasSoftMaxForwardViaDnnLibraryImpl(
// invoke elementwise add with broadcast kernel
::onnxruntime::cuda::BinaryElementWiseImpl(
stream,
(int32_t)X_shape.NumDimensions(),
&lhs_padded_strides,
X_data,
@ -311,6 +315,7 @@ void DispatchBiasSoftMaxForwardViaDnnLibraryImpl(
#define SPECIALIZED_BIAS_SOFTMAX_IMPL_VIA_DNN(T) \
template void DispatchBiasSoftMaxForwardViaDnnLibraryImpl<T>( \
cudaStream_t stream, \
cudnnHandle_t cudaDnnHandle, \
int element_count, \
int batch_count, \

View file

@ -25,6 +25,7 @@ namespace cuda {
BinaryElementwisePreparation prepare; \
ORT_RETURN_IF_ERROR(Prepare(context, &prepare)); \
Impl_##x<typename ToCudaType<T>::MappedType>( \
Stream(), \
prepare.output_rank_or_simple_broadcast, \
&prepare.lhs_padded_strides, \
reinterpret_cast<const typename ToCudaType<T>::MappedType*>(prepare.lhs_tensor->template Data<T>()), \

View file

@ -20,7 +20,8 @@ namespace cuda {
#define CONTRIB_BINARY_ELEMENTWISE_IMPL(name) \
CONTRIB_BINARY_ELEMENTWISE_IMPL_DECLARATION(name) { \
BinaryElementWiseImpl(output_rank_or_simple_broadcast, \
BinaryElementWiseImpl(stream, \
output_rank_or_simple_broadcast, \
lhs_padded_strides, \
lhs_data, \
rhs_padded_strides, \
@ -34,7 +35,8 @@ namespace cuda {
}
#define CONTRIB_SPECIALIZED_BINARY_ELEMENTWISE_IMPL(x, T) \
template void Impl_##x<T>(int32_t output_rank, \
template void Impl_##x<T>(cudaStream_t stream, \
int32_t output_rank, \
const TArray<int64_t>* lhs_padded_strides, \
const T* lhs_data, \
const TArray<int64_t>* rhs_padded_strides, \

View file

@ -20,6 +20,7 @@ namespace cuda {
#define CONTRIB_BINARY_ELEMENTWISE_IMPL_DECLARATION(name) \
template <typename T> \
void Impl_##name( \
cudaStream_t stream, \
int32_t output_rank_or_simple_broadcast, \
const TArray<int64_t>* lhs_padded_strides, \
const T* lhs_data, \

View file

@ -42,6 +42,7 @@ Status ComplexMul<T, is_conj>::ComputeInternal(OpKernelContext* context) const {
BinaryElementwisePreparation prepare;
ORT_RETURN_IF_ERROR(Prepare(context, &prepare));
ComplexMul_Impl<typename ToCudaType<T>::MappedType>(
Stream(),
prepare.output_rank_or_simple_broadcast,
&prepare.lhs_padded_strides,
reinterpret_cast<const typename ToCudaType<T>::MappedType*>(prepare.lhs_tensor->template Data<T>()),

View file

@ -90,6 +90,7 @@ __global__ void _ElementWiseWithStrideTwo(
template <typename T>
void ComplexMul_Impl(
cudaStream_t stream,
int32_t output_rank_or_simple_broadcast,
const TArray<int64_t>* lhs_padded_strides,
const T* lhs_data,
@ -110,7 +111,7 @@ void ComplexMul_Impl(
CUDA_LONG N = static_cast<CUDA_LONG>(count);
if (lhs_padded_strides && rhs_padded_strides && lhs_padded_strides->Size() && rhs_padded_strides->Size())
_ElementWiseWithStrideTwo<T, true, true, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
_ElementWiseWithStrideTwo<T, true, true, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
output_rank_or_simple_broadcast,
*lhs_padded_strides,
lhs_data,
@ -123,7 +124,7 @@ void ComplexMul_Impl(
rhs_size,
is_conj);
else if (lhs_padded_strides && lhs_padded_strides->Size())
_ElementWiseWithStrideTwo<T, true, false, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
_ElementWiseWithStrideTwo<T, true, false, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
output_rank_or_simple_broadcast,
*lhs_padded_strides,
lhs_data,
@ -136,7 +137,7 @@ void ComplexMul_Impl(
rhs_size,
is_conj);
else
_ElementWiseWithStrideTwo<T, false, true, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
_ElementWiseWithStrideTwo<T, false, true, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
output_rank_or_simple_broadcast,
*lhs_padded_strides,
lhs_data,
@ -152,6 +153,7 @@ void ComplexMul_Impl(
#define SPECIALIZE_STACKEDCOMPLEXMUL_IMPL(T) \
template void ComplexMul_Impl<T>( \
cudaStream_t stream, \
int32_t output_rank_or_simple_broadcast, \
const TArray<int64_t>* lhs_padded_strides, \
const T* lhs_data, \

View file

@ -13,6 +13,7 @@ using namespace ::onnxruntime::cuda;
template <typename T>
void ComplexMul_Impl(
cudaStream_t stream,
int32_t output_rank_or_simple_broadcast,
const TArray<int64_t>* lhs_padded_strides,
const T* lhs_data,

View file

@ -127,11 +127,11 @@ Status FFTBase<T>::DoFFT(OpKernelContext* context, const Tensor* X, bool complex
Tensor* Y = const_cast<OpKernelContext*>(context)->Output(0, TensorShape(output_dims));
auto* x_data = reinterpret_cast<const CudaT*>(X->template Data<T>());
auto* y_data = reinterpret_cast<CudaT*>(Y->template MutableData<T>());
CUFFT_RETURN_IF_ERROR(cufftSetStream(plan_info.plan, Stream()));
CUFFT_RETURN_IF_ERROR(cufftXtExec(plan_info.plan, const_cast<CudaT*>(x_data), y_data, inverse ? CUFFT_INVERSE : CUFFT_FORWARD));
if (inverse) {
PostProcess(signal_dims, output_size, y_data);
PostProcess(Stream(), signal_dims, output_size, y_data);
}
return Status::OK();

View file

@ -27,14 +27,14 @@ __global__ void _Normalize(
}
template <typename T>
void PostProcess(const std::vector<int64_t>& signal_dims, int64_t N, T* output_data) {
void PostProcess(cudaStream_t stream, const std::vector<int64_t>& signal_dims, int64_t N, T* output_data) {
int64_t scale = std::accumulate(signal_dims.begin(), signal_dims.end(), 1ll, std::multiplies<int64_t>());
int blocksPerGrid = (int)(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
_Normalize<T><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(output_data, N, static_cast<int>(scale));
_Normalize<T><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(output_data, N, static_cast<int>(scale));
}
#define SPECIALIZED_IMPL(T) \
template void PostProcess<T>(const std::vector<int64_t>& signal_dims, int64_t N, T* output_data);
template void PostProcess<T>(cudaStream_t stream, const std::vector<int64_t>& signal_dims, int64_t N, T* output_data);
SPECIALIZED_IMPL(float)
SPECIALIZED_IMPL(double)

View file

@ -12,7 +12,7 @@ namespace contrib {
namespace cuda {
template <typename T>
void PostProcess(const std::vector<int64_t>& signal_dims, int64_t N, T* output_data);
void PostProcess(cudaStream_t stream, const std::vector<int64_t>& signal_dims, int64_t N, T* output_data);
} // namespace cuda
} // namespace contrib

View file

@ -158,6 +158,7 @@ Status QAttention<T, int8_t>::ComputeInternal(OpKernelContext* context) const {
}
// scale back and bias
CudaDequantizeWithBias(
Stream(),
gemm_buffer_quantized.get(),
reinterpret_cast<const CudaT*>(bias->template Data<T>()),
reinterpret_cast<CudaT*>(gemm_buffer.get()),
@ -172,6 +173,7 @@ Status QAttention<T, int8_t>::ComputeInternal(OpKernelContext* context) const {
auto temp_buffer = GetScratchBuffer<void>(workSpaceSize);
if (!LaunchAttentionKernel(
GetDeviceProp(),
Stream(),
reinterpret_cast<const CudaT*>(gemm_buffer.get()),
nullptr == mask_index ? nullptr : mask_index->template Data<int>(),
nullptr == mask_index ? nullptr : &(mask_index->Shape().GetDims()),

View file

@ -31,10 +31,10 @@ __global__ void DequantizeLinearKernel(const int32_t* quantize, const T* bias, T
}
template <class T>
Status CudaDequantizeWithBias(const int32_t* quantize, const T* bias, T* output, T scale, int m, int n) {
Status CudaDequantizeWithBias(cudaStream_t stream, const int32_t* quantize, const T* bias, T* output, T scale, int m, int n) {
int blocksPerGrid = static_cast<int>(CeilDiv(m * n, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
CUDA_LONG N = static_cast<CUDA_LONG>(m * n);
DequantizeLinearKernel<T, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
DequantizeLinearKernel<T, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
quantize,
bias,
output,
@ -44,8 +44,8 @@ Status CudaDequantizeWithBias(const int32_t* quantize, const T* bias, T* output,
return Status::OK();
}
template Status CudaDequantizeWithBias<float>(const int32_t* quantize, const float* bias, float* output, float scale, int m, int n);
template Status CudaDequantizeWithBias<half>(const int32_t* quantize, const half* bias, half* output, half scale, int m, int n);
template Status CudaDequantizeWithBias<float>(cudaStream_t stream, const int32_t* quantize, const float* bias, float* output, float scale, int m, int n);
template Status CudaDequantizeWithBias<half>(cudaStream_t stream, const int32_t* quantize, const half* bias, half* output, half scale, int m, int n);
} // namespace cuda
} // namespace contrib

View file

@ -8,7 +8,7 @@ namespace onnxruntime {
namespace contrib {
namespace cuda {
template <class Tin>
Status CudaDequantizeWithBias(const int32_t* quantize, const Tin* bias, Tin* output, Tin scale, int m, int n);
Status CudaDequantizeWithBias(cudaStream_t stream, const int32_t* quantize, const Tin* bias, Tin* output, Tin scale, int m, int n);
} // namespace cuda
} // namespace contrib

View file

@ -56,6 +56,7 @@ Status Crop<T>::ComputeInternal(OpKernelContext* context) const {
fast_divmod fdm_YHW(gsl::narrow_cast<int>((bottomLimit - topBorder) * (rightLimit - leftBorder)));
CropImpl<CudaT>(
Stream(),
reinterpret_cast<const CudaT*>(X->template Data<T>()),
gsl::narrow_cast<int>(leftBorder),
gsl::narrow_cast<int>(topBorder),

View file

@ -31,6 +31,7 @@ __global__ void _CropKernel(
template <typename T>
void CropImpl(
cudaStream_t stream,
const T* input_data,
const int src_start_x,
const int src_start_y,
@ -41,12 +42,12 @@ void CropImpl(
T* output_data,
const size_t N) {
int blocksPerGrid = (int)(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
_CropKernel<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
_CropKernel<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
input_data, src_start_x, src_start_y, src_w, src_hw, fdm_dst_w, fdm_dst_hw, output_data, (CUDA_LONG)N);
}
#define SPECIALIZED_IMPL(T) \
template void CropImpl<T>(const T* input_data, const int src_start_x, const int src_start_y, const int src_w, const int src_hw, const fast_divmod& fdm_dst_w, const fast_divmod& fdm_dst_hw, T* output_data, const size_t N);
template void CropImpl<T>(cudaStream_t stream, const T* input_data, const int src_start_x, const int src_start_y, const int src_w, const int src_hw, const fast_divmod& fdm_dst_w, const fast_divmod& fdm_dst_hw, T* output_data, const size_t N);
SPECIALIZED_IMPL(float)
SPECIALIZED_IMPL(double)

View file

@ -12,6 +12,7 @@ using namespace onnxruntime::cuda;
template <typename T>
void CropImpl(
cudaStream_t stream,
const T* input_data,
const int src_start_x,
const int src_start_y,

View file

@ -30,7 +30,7 @@ ImageScaler<T>::ImageScaler(const OpKernelInfo& info) : CudaKernel(info) {
ORT_ENFORCE(info.GetAttrs<float>("bias", bias_).IsOK());
b_data_ = GetScratchBuffer<float>(bias_.size());
CUDA_CALL_THROW(cudaMemcpy(b_data_.get(), bias_.data(), sizeof(float) * bias_.size(), cudaMemcpyHostToDevice));
CUDA_CALL_THROW(cudaMemcpyAsync(b_data_.get(), bias_.data(), sizeof(float) * bias_.size(), cudaMemcpyHostToDevice, Stream()));
}
template <typename T>
@ -53,6 +53,7 @@ Status ImageScaler<T>::ComputeInternal(OpKernelContext* context) const {
typedef typename ToCudaType<T>::MappedType CudaT;
ImageScalerImpl<CudaT>(
Stream(),
reinterpret_cast<const CudaT*>(X->template Data<T>()),
scale_,
b_data_.get(),

View file

@ -30,6 +30,7 @@ __global__ void _ImageScalerKernel(
template <typename T>
void ImageScalerImpl(
cudaStream_t stream,
const T* input_data,
const float scale,
const float* bias_data,
@ -40,17 +41,17 @@ void ImageScalerImpl(
fast_divmod fdm_HW((int)(dims[2] * dims[3]));
fast_divmod fdm_C;
if (dims[0] == 1) {
_ImageScalerKernel<T, true><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
_ImageScalerKernel<T, true><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
input_data, scale, bias_data, fdm_C, fdm_HW, output_data, N);
} else {
fdm_C = fast_divmod((int)dims[1]);
_ImageScalerKernel<T, false><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
_ImageScalerKernel<T, false><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
input_data, scale, bias_data, fdm_C, fdm_HW, output_data, N);
}
}
#define SPECIALIZED_IMPL(T) \
template void ImageScalerImpl<T>(const T* input_data, const float scale, const float* bias_data, const int64_t dims[4], T* output_data, const size_t N);
template void ImageScalerImpl<T>(cudaStream_t stream, const T* input_data, const float scale, const float* bias_data, const int64_t dims[4], T* output_data, const size_t N);
SPECIALIZED_IMPL(float)
SPECIALIZED_IMPL(double)

View file

@ -10,6 +10,7 @@ namespace cuda {
template <typename T>
void ImageScalerImpl(
cudaStream_t stream,
const T* input_data,
const float scale,
const float* bias_data,

View file

@ -15,6 +15,7 @@ namespace rocm {
template <typename T>
void DispatchBiasSoftmaxForwardImpl(
hipStream_t stream,
Tensor* output_tensor,
const Tensor* input_tensor,
const Tensor* input_bias_tensor,
@ -25,6 +26,7 @@ void DispatchBiasSoftmaxForwardImpl(
template <typename T>
void DispatchBiasSoftMaxForwardViaDnnLibraryImpl(
hipStream_t stream,
miopenHandle_t miopenHandle,
int element_count,
int batch_count,
@ -67,12 +69,12 @@ Status BiasSoftmax::ComputeInternal(OpKernelContext* ctx) const {
// expect thread blocks can fill SM at high occupancy without overflowing registers
utils::MLTypeCallDispatcher<DispatchBiasSoftmaxForward, float, MLFloat16>
t_disp(X->GetElementType());
t_disp.Invoke(Y, X, B, D, N, D, broadcast_size);
t_disp.Invoke(Stream(), Y, X, B, D, N, D, broadcast_size);
} else {
// need to fallback to add kernel + CUDA DNN library softmax call :/
utils::MLTypeCallDispatcher<DispatchBiasSoftMaxForwardViaDnnLibrary, float, MLFloat16>
t_disp(X->GetElementType());
t_disp.Invoke(MiopenHandle(), D, N, broadcast_axis, softmax_axis, X_shape, X, B_shape, B, Y);
t_disp.Invoke(Stream(), MiopenHandle(), D, N, broadcast_axis, softmax_axis, X_shape, X, B_shape, B, Y);
}
return Status::OK();
@ -80,6 +82,7 @@ Status BiasSoftmax::ComputeInternal(OpKernelContext* ctx) const {
template <typename T>
void DispatchBiasSoftmaxForward<T>::operator()(
hipStream_t stream,
Tensor* output,
const Tensor* input,
const Tensor* input_bias,
@ -88,6 +91,7 @@ void DispatchBiasSoftmaxForward<T>::operator()(
int batch_stride,
int bias_broadcast_size_per_batch) {
DispatchBiasSoftmaxForwardImpl<T>(
stream,
output,
input,
input_bias,
@ -99,6 +103,7 @@ void DispatchBiasSoftmaxForward<T>::operator()(
template <typename T>
void DispatchBiasSoftMaxForwardViaDnnLibrary<T>::operator()(
hipStream_t stream,
miopenHandle_t miopenHandle,
int element_count,
int batch_count,
@ -110,6 +115,7 @@ void DispatchBiasSoftMaxForwardViaDnnLibrary<T>::operator()(
const onnxruntime::Tensor* B,
onnxruntime::Tensor* Y) {
DispatchBiasSoftMaxForwardViaDnnLibraryImpl<T>(
stream,
miopenHandle,
element_count,
batch_count,

View file

@ -13,6 +13,7 @@ namespace rocm {
template <typename T>
struct DispatchBiasSoftmaxForward {
void operator()(
hipStream_t stream,
Tensor* output,
const Tensor* input,
const Tensor* input_bias,
@ -25,6 +26,7 @@ struct DispatchBiasSoftmaxForward {
template <typename T>
struct DispatchBiasSoftMaxForwardViaDnnLibrary {
void operator()(
hipStream_t stream,
miopenHandle_t miopenHandle,
int element_count,
int batch_count,

View file

@ -128,6 +128,7 @@ __global__ void BiasSoftmaxWarpForward(
template <typename T>
void DispatchBiasSoftmaxForwardImpl(
hipStream_t stream,
Tensor* output_tensor,
const Tensor* input_tensor,
const Tensor* input_bias_tensor,
@ -168,47 +169,47 @@ void DispatchBiasSoftmaxForwardImpl(
// Launch code would be more elegant if C++ supported FOR CONSTEXPR
switch (log2_elements) {
case 0: // 1
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 0>), dim3(blocks), dim3(threads), 0, 0,
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 0>), dim3(blocks), dim3(threads), 0, stream,
output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
break;
case 1: // 2
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 1>), dim3(blocks), dim3(threads), 0, 0,
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 1>), dim3(blocks), dim3(threads), 0, stream,
output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
break;
case 2: // 4
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 2>), dim3(blocks), dim3(threads), 0, 0,
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 2>), dim3(blocks), dim3(threads), 0, stream,
output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
break;
case 3: // 8
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 3>), dim3(blocks), dim3(threads), 0, 0,
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 3>), dim3(blocks), dim3(threads), 0, stream,
output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
break;
case 4: // 16
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 4>), dim3(blocks), dim3(threads), 0, 0,
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 4>), dim3(blocks), dim3(threads), 0, stream,
output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
break;
case 5: // 32
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 5>), dim3(blocks), dim3(threads), 0, 0,
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 5>), dim3(blocks), dim3(threads), 0, stream,
output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
break;
case 6: // 64
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 6>), dim3(blocks), dim3(threads), 0, 0,
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 6>), dim3(blocks), dim3(threads), 0, stream,
output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
break;
case 7: // 128
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 7>), dim3(blocks), dim3(threads), 0, 0,
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 7>), dim3(blocks), dim3(threads), 0, stream,
output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
break;
case 8: // 256
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 8>), dim3(blocks), dim3(threads), 0, 0,
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 8>), dim3(blocks), dim3(threads), 0, stream,
output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
break;
case 9: // 512
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 9>), dim3(blocks), dim3(threads), 0, 0,
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 9>), dim3(blocks), dim3(threads), 0, stream,
output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
break;
case 10: // 1024
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 10>), dim3(blocks), dim3(threads), 0, 0,
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 10>), dim3(blocks), dim3(threads), 0, stream,
output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
break;
default:
@ -218,6 +219,7 @@ void DispatchBiasSoftmaxForwardImpl(
#define SPECIALIZED_BIAS_SOFTMAX_IMPL(T) \
template void DispatchBiasSoftmaxForwardImpl<T>( \
hipStream_t stream, \
Tensor * output_tensor, \
const Tensor* input_tensor, \
const Tensor* input_bias_tensor, \
@ -234,6 +236,7 @@ SPECIALIZED_BIAS_SOFTMAX_IMPL(MLFloat16)
// note: This is an unhappy path! There is no performance benefit for the fusion.
template <typename T>
void DispatchBiasSoftMaxForwardViaDnnLibraryImpl(
hipStream_t stream,
miopenHandle_t miopenHandle,
int element_count,
int batch_count,
@ -278,6 +281,7 @@ void DispatchBiasSoftMaxForwardViaDnnLibraryImpl(
// invoke elementwise add with broadcast kernel
::onnxruntime::rocm::BinaryElementWiseImpl(
stream,
(int32_t)X_shape.NumDimensions(),
&lhs_padded_strides,
X_data,
@ -311,6 +315,7 @@ void DispatchBiasSoftMaxForwardViaDnnLibraryImpl(
#define SPECIALIZED_BIAS_SOFTMAX_IMPL_VIA_DNN(T) \
template void DispatchBiasSoftMaxForwardViaDnnLibraryImpl<T>( \
hipStream_t stream, \
miopenHandle_t miopenHandle, \
int element_count, \
int batch_count, \

View file

@ -160,14 +160,16 @@ struct ProviderHostImpl : ProviderHost {
return onnxruntime::make_unique<CUDAPinnedAllocator>(device_id, name);
}
std::unique_ptr<IDataTransfer> CreateGPUDataTransfer() override { return onnxruntime::make_unique<GPUDataTransfer>(); }
void cuda__Impl_Cast(const int64_t* input_data, int32_t* output_data, size_t count) override {
return cuda::Impl_Cast(input_data, output_data, count);
std::unique_ptr<IDataTransfer> CreateGPUDataTransfer(void* stream) override {
return onnxruntime::make_unique<GPUDataTransfer>(static_cast<cudaStream_t>(stream));
}
void cuda__Impl_Cast(const int32_t* input_data, int64_t* output_data, size_t count) override {
return cuda::Impl_Cast(input_data, output_data, count);
void cuda__Impl_Cast(void* stream, const int64_t* input_data, int32_t* output_data, size_t count) override {
return cuda::Impl_Cast(static_cast<cudaStream_t>(stream), input_data, output_data, count);
}
void cuda__Impl_Cast(void* stream, const int32_t* input_data, int64_t* output_data, size_t count) override {
return cuda::Impl_Cast(static_cast<cudaStream_t>(stream), input_data, output_data, count);
}
bool CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) override { return CudaCall<cudaError, false>(cudaError(retCode), exprString, libName, cudaError(successCode), msg); }
@ -684,6 +686,13 @@ std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Tensor
return nullptr;
}
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Tensorrt(const OrtTensorRTProviderOptions* provider_options) {
if (auto provider = s_library_tensorrt.Get())
return provider->CreateExecutionProviderFactory(provider_options);
return nullptr;
}
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_OpenVINO(const OrtOpenVINOProviderOptions* provider_options) {
if (auto provider = s_library_openvino.Get())
return provider->CreateExecutionProviderFactory(provider_options);
@ -719,6 +728,16 @@ ORT_API_STATUS_IMPL(OrtSessionOptionsAppendExecutionProvider_Tensorrt, _In_ OrtS
return nullptr;
}
ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider_TensorRT, _In_ OrtSessionOptions* options, _In_ const OrtTensorRTProviderOptions* tensorrt_options) {
auto factory = onnxruntime::CreateExecutionProviderFactory_Tensorrt(tensorrt_options);
if (!factory) {
return OrtApis::CreateStatus(ORT_FAIL, "SessionOptionsAppendExecutionProvider_Tensorrt: Failed to load shared library");
}
options->provider_factories.push_back(factory);
return nullptr;
}
ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider_OpenVINO, _In_ OrtSessionOptions* options, _In_ const OrtOpenVINOProviderOptions* provider_options) {
auto factory = onnxruntime::CreateExecutionProviderFactory_OpenVINO(provider_options);
if (!factory) {

View file

@ -178,7 +178,8 @@ class LoopImpl {
LoopImpl(OpKernelContextInternal& context,
const SessionState& session_state,
const Loop::Info& info,
const Loop::ConcatOutput& concat_output_func);
const Loop::ConcatOutput& concat_output_func,
void* stream);
// Initialize by validating all the inputs, and allocating the output tensors
Status Initialize();
@ -211,9 +212,11 @@ class LoopImpl {
std::vector<std::vector<OrtValue>> loop_output_tensors_;
const Loop::ConcatOutput& concat_output_func_;
void* stream_;
};
static Status ConcatenateCpuOutput(std::vector<OrtValue>& per_iteration_output,
static Status ConcatenateCpuOutput(void* /*stream*/,
std::vector<OrtValue>& per_iteration_output,
void* output, size_t output_size_in_bytes) {
const auto& first_output = per_iteration_output.front().Get<Tensor>();
const auto& per_iteration_shape = first_output.Shape();
@ -253,6 +256,7 @@ Loop::Loop(const OpKernelInfo& info) : IControlFlowKernel(info) {
ORT_IGNORE_RETURN_VALUE(proto);
concat_output_func_ = ConcatenateCpuOutput;
stream_ = nullptr;
}
// we need this to be in the .cc so 'unique_ptr<Info> info_' can be handled
@ -345,7 +349,7 @@ Status Loop::Compute(OpKernelContext* ctx) const {
ORT_ENFORCE(session_state, "Subgraph SessionState was not found for 'body' attribute.");
ORT_ENFORCE(feeds_fetches_manager_, "CreateFeedsFetchesManager must be called prior to execution of graph.");
LoopImpl loop_impl{*ctx_internal, *session_state, *info_, concat_output_func_};
LoopImpl loop_impl{*ctx_internal, *session_state, *info_, concat_output_func_, stream_};
auto status = loop_impl.Initialize();
ORT_RETURN_IF_ERROR(status);
@ -358,12 +362,14 @@ Status Loop::Compute(OpKernelContext* ctx) const {
LoopImpl::LoopImpl(OpKernelContextInternal& context,
const SessionState& session_state,
const Loop::Info& subgraph_info,
const Loop::ConcatOutput& concat_output_func)
const Loop::ConcatOutput& concat_output_func,
void* stream)
: context_(context),
session_state_(session_state),
info_(subgraph_info),
implicit_inputs_(context_.GetImplicitInputs()),
concat_output_func_(concat_output_func) {
concat_output_func_(concat_output_func),
stream_(stream) {
auto* max_trip_count_tensor = context.Input<Tensor>(0);
max_trip_count_ = max_trip_count_tensor ? *max_trip_count_tensor->Data<int64_t>() : INT64_MAX;
@ -457,7 +463,7 @@ Status LoopImpl::ConcatenateLoopOutput(std::vector<OrtValue>& per_iteration_outp
TensorShape output_shape{dims};
Tensor* output = context_.Output(output_index, output_shape);
ORT_RETURN_IF_ERROR(concat_output_func_(per_iteration_output, output->MutableDataRaw(), output->SizeInBytes()));
ORT_RETURN_IF_ERROR(concat_output_func_(stream_, per_iteration_output, output->MutableDataRaw(), output->SizeInBytes()));
return Status::OK();
}

View file

@ -29,17 +29,19 @@ class Loop : public controlflow::IControlFlowKernel {
// function to concatenate the OrtValue instances from each Loop iteration into a single output buffer.
// @param per_iteration_output OrtValue instances from each iteration. Never empty. All should have the same shape.
// @param output Pre-allocated output buffer. On device specific to the ExecutionProvider running the Loop node.
using ConcatOutput = std::function<Status(std::vector<OrtValue>& per_iteration_output,
using ConcatOutput = std::function<Status(void* stream, std::vector<OrtValue>& per_iteration_output,
void* output, size_t output_size_in_bytes)>;
protected:
// derived class can provide implementation for handling concatenation of Loop output on a different device
void SetConcatOutputFunc(const ConcatOutput& concat_output_func) { concat_output_func_ = concat_output_func; }
void SetComputeStream(void* stream) { stream_ = stream; }
private:
// Info and FeedsFetchesManager re-used for each subgraph execution.
std::unique_ptr<Info> info_;
std::unique_ptr<FeedsFetchesManager> feeds_fetches_manager_;
ConcatOutput concat_output_func_;
void* stream_;
};
} // namespace onnxruntime

View file

@ -14,7 +14,7 @@ namespace DeviceHelpers {
namespace CpuDeviceHelpers {
// CPU specific Data copy helper
Status DataCopy(const Tensor& input, Tensor& output) {
Status DataCopy(const Tensor& input, Tensor& output, void* /*einsum_cuda_assets*/) {
ORT_ENFORCE(output.SizeInBytes() == input.SizeInBytes(),
"Einsum op: The candidate output does not match the actual output's shape");
// There are no string tensors in Einsum's case - so safely use memcpy
@ -156,7 +156,7 @@ static std::unique_ptr<Tensor> DiagonalInnermostDims(const Tensor& input,
return output;
}
std::unique_ptr<Tensor> Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator) {
std::unique_ptr<Tensor> Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator, void* /*einsum_cuda_assets*/) {
const auto& input_shape = input.Shape();
const auto& input_dims = input_shape.GetDims();
auto rank = static_cast<int64_t>(input_dims.size());

View file

@ -23,7 +23,7 @@ namespace EinsumOp {
namespace DeviceHelpers {
// Data copy op - Copies raw data from the source tensor's buffer to the destination tensor's buffer
using DataCopy = std::function<Status(const Tensor& input, Tensor& output)>;
using DataCopy = std::function<Status(const Tensor& input, Tensor& output, void* einsum_cuda_assets)>;
// Transpose op - Transposes given input based on data in `permutation`
using Transpose = std::function<Status(const std::vector<size_t>& permutation, const Tensor& input,
@ -54,12 +54,12 @@ using ReduceSum = std::function<Tensor(const Tensor& input, const std::vector<in
// Eg. input_shape = [2, 3, 5, 3] and dim_1 = 1 and dim_2 = 3
// The output_shape will be [2, 3, 5] and dim_1 will contain the diagonal elements
using Diagonal = std::function<std::unique_ptr<Tensor>(const Tensor& input, int64_t dim_1, int64_t dim_2,
AllocatorPtr allocator)>;
AllocatorPtr allocator, void* einsum_cuda_assets)>;
// These are CPU specific device helper implementations
namespace CpuDeviceHelpers {
Status DataCopy(const Tensor& input, Tensor& output);
Status DataCopy(const Tensor& input, Tensor& output, void* einsum_cuda_assets);
Status Transpose(const std::vector<size_t>& permutation, const Tensor& input,
Tensor& output, const TensorShape* input_shape_override, void* einsum_cuda_assets);
@ -76,7 +76,7 @@ Tensor ReduceSum(const Tensor& input, const std::vector<int64_t>& reduce_axes,
const TensorShape* input_shape_override,
concurrency::ThreadPool* tp, void* einsum_cuda_assets);
std::unique_ptr<Tensor> Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator);
std::unique_ptr<Tensor> Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator, void* einsum_cuda_assets);
} // namespace CpuDeviceHelpers

View file

@ -440,7 +440,7 @@ Status EinsumComputePreprocessor::PreprocessInputs() {
preprocessed = device_diagonal_func_(preprocessed ? *preprocessed : *inputs_[input_iter],
subscript_indices_to_input_index[subscript_index],
dim_index_in_preprocessed_input,
allocator_);
allocator_, einsum_ep_assets_);
}
++dim_index_in_original_input;
}

View file

@ -58,13 +58,13 @@ void EinsumTypedComputeProcessor<T>::FinalizeOutput(const Tensor& candidate_outp
// into the buffer of the actual output given to us by the execution frame
// We need to do this because the buffer owned by the output tensor of the op could be user provided buffer
auto status = device_data_copy_func_(*candidate_output_transposed, output);
auto status = device_data_copy_func_(*candidate_output_transposed, output, einsum_ep_assets_);
ORT_ENFORCE(status.IsOK(), "Einsum op: Could not copy the intermediate output's buffer into the op's output buffer. Error: ",
status.ErrorMessage());
} else {
// Copy the output candidate into the op's output
auto status = device_data_copy_func_(candidate_output, output);
auto status = device_data_copy_func_(candidate_output, output, einsum_ep_assets_);
ORT_ENFORCE(status.IsOK(), "Einsum op: Could not copy the intermediate output's buffer into the op's output buffer. Error: ",
status.ErrorMessage());
}

View file

@ -38,6 +38,7 @@ namespace cuda {
ORT_RETURN_IF_ERROR(UnaryElementwise::Prepare(context, &p)); \
Ctx##x func_ctx = MakeFuncCtx(); \
Impl_##x<typename ToCudaType<T>::MappedType>( \
Stream(), \
reinterpret_cast<const typename ToCudaType<T>::MappedType*>(p.input_tensor->template Data<T>()), \
reinterpret_cast<typename ToCudaType<T>::MappedType*>(p.output_tensor->template MutableData<T>()), \
&func_ctx, p.output_tensor->Shape().Size()); \

View file

@ -84,14 +84,15 @@ struct OP_ThresholdedRelu : public CtxThresholdedRelu {
#define UNARY_ACTIVATION_IMPL(name) \
UNARY_ACTIVATION_IMPL_DECLARATION(name) { \
UnaryElementWiseImpl(input_data, \
UnaryElementWiseImpl(stream, \
input_data, \
output_data, \
*reinterpret_cast<const OP_##name<T>*>(func_ctx), \
count); \
}
#define SPECIALIZED_UNARY_ACTIVATION_IMPL(name, T) \
template void Impl_##name<T>(const T* input_data, T* output_data, const Ctx##name* func_ctx, size_t count);
template void Impl_##name<T>(cudaStream_t stream, const T* input_data, T* output_data, const Ctx##name* func_ctx, size_t count);
#if CUDA_VERSION >= 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__))
#define SPECIALIZED_UNARY_ACTIVATION_IMPL_BF16(name) SPECIALIZED_UNARY_ACTIVATION_IMPL(name, nv_bfloat16)

View file

@ -48,6 +48,7 @@ typedef CtxAlpha CtxThresholdedRelu;
#define UNARY_ACTIVATION_IMPL_DECLARATION(name) \
template <typename T> \
void Impl_##name( \
cudaStream_t stream, \
const T* input_data, \
T* output_data, \
const Ctx##name* func_ctx, \

View file

@ -51,7 +51,7 @@ ONNX_OPERATOR_KERNEL_EX(Loop,
.TypeConstraint("V", DataTypeImpl::AllFixedSizeTensorTypes()),
Loop);
static Status ConcatenateGpuOutput(std::vector<OrtValue>& per_iteration_output,
static Status ConcatenateGpuOutput(void* stream, std::vector<OrtValue>& per_iteration_output,
void* output, ptrdiff_t output_size_in_bytes) {
const auto& first_output = per_iteration_output.front().Get<Tensor>();
const auto& per_iteration_shape = first_output.Shape();
@ -68,8 +68,8 @@ static Status ConcatenateGpuOutput(std::vector<OrtValue>& per_iteration_output,
" Expected:", per_iteration_shape, " Got:", iteration_data.Shape());
}
CUDA_RETURN_IF_ERROR(cudaMemcpy(cur_output, iteration_data.DataRaw(), bytes_per_iteration,
cudaMemcpyDeviceToDevice));
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(cur_output, iteration_data.DataRaw(), bytes_per_iteration,
cudaMemcpyDeviceToDevice, static_cast<cudaStream_t>(stream)));
cur_output = static_cast<void*>((static_cast<gsl::byte*>(cur_output) + bytes_per_iteration));
}
@ -82,6 +82,7 @@ static Status ConcatenateGpuOutput(std::vector<OrtValue>& per_iteration_output,
Loop::Loop(const OpKernelInfo& info) : onnxruntime::Loop(info) {
SetConcatOutputFunc(ConcatenateGpuOutput);
SetComputeStream(static_cast<void*>(info.GetExecutionProvider()->GetComputeStream()));
}
Status Loop::Compute(OpKernelContext* ctx) const {

View file

@ -180,6 +180,7 @@ __global__ void _BinaryElementWiseRhsPerChannelBatchN(
template <typename T, typename T1, typename T2, typename FuncT>
void BinaryElementWiseNoBroadcastImpl(
cudaStream_t stream,
const T1* lhs_data,
const T2* rhs_data,
T* output_data,
@ -190,7 +191,7 @@ void BinaryElementWiseNoBroadcastImpl(
int blocksPerGrid = static_cast<int>(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
CUDA_LONG N = static_cast<CUDA_LONG>(count);
_BinaryElementWiseSimple<true, true, T, T1, T2, FuncT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
_BinaryElementWiseSimple<true, true, T, T1, T2, FuncT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
lhs_data,
rhs_data,
output_data,
@ -200,6 +201,7 @@ void BinaryElementWiseNoBroadcastImpl(
template <typename T, typename T1, typename T2, typename FuncT>
void BinaryElementWiseImpl(
cudaStream_t stream,
int32_t output_rank_or_simple_broadcast,
const TArray<int64_t>* lhs_padded_strides,
const T1* lhs_data,
@ -217,14 +219,14 @@ void BinaryElementWiseImpl(
int blocksPerGrid = static_cast<int>(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
CUDA_LONG N = static_cast<CUDA_LONG>(count);
if (output_rank_or_simple_broadcast == static_cast<int32_t>(SimpleBroadcast::NoBroadcast)) {
_BinaryElementWiseSimple<true, true, T, T1, T2, FuncT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
_BinaryElementWiseSimple<true, true, T, T1, T2, FuncT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
lhs_data,
rhs_data,
output_data,
func,
N);
} else if (output_rank_or_simple_broadcast == static_cast<int32_t>(SimpleBroadcast::LeftScalar)) {
_BinaryElementWiseSimple<false, true, T, T1, T2, FuncT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
_BinaryElementWiseSimple<false, true, T, T1, T2, FuncT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
lhs_data,
rhs_data,
output_data,
@ -232,14 +234,14 @@ void BinaryElementWiseImpl(
N);
} else if (output_rank_or_simple_broadcast == static_cast<int32_t>(SimpleBroadcast::RightScalar)) {
_BinaryElementWiseSimple<true, false, T, T1, T2, FuncT, GridDim::maxThreadsPerBlock,
GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
lhs_data,
rhs_data,
output_data,
func,
N);
} else if (output_rank_or_simple_broadcast == static_cast<int32_t>(SimpleBroadcast::RightPerChannelBatch1)) {
_BinaryElementWiseRhsPerChannelBatch1<T, T1, T2, FuncT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
_BinaryElementWiseRhsPerChannelBatch1<T, T1, T2, FuncT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
lhs_data,
rhs_data,
fdm_H,
@ -247,7 +249,7 @@ void BinaryElementWiseImpl(
func,
N);
} else if (output_rank_or_simple_broadcast == static_cast<int32_t>(SimpleBroadcast::RightPerChannelBatchN)) {
_BinaryElementWiseRhsPerChannelBatchN<T, T1, T2, FuncT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
_BinaryElementWiseRhsPerChannelBatchN<T, T1, T2, FuncT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
lhs_data,
rhs_data,
fdm_H,
@ -257,7 +259,7 @@ void BinaryElementWiseImpl(
N);
} else {
if (lhs_padded_strides && rhs_padded_strides && lhs_padded_strides->Size() && rhs_padded_strides->Size())
_BinaryElementWise<T, T1, T2, FuncT, true, true, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
_BinaryElementWise<T, T1, T2, FuncT, true, true, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
output_rank_or_simple_broadcast,
*lhs_padded_strides,
lhs_data,
@ -268,7 +270,7 @@ void BinaryElementWiseImpl(
func,
N);
else if (lhs_padded_strides && lhs_padded_strides->Size())
_BinaryElementWise<T, T1, T2, FuncT, true, false, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
_BinaryElementWise<T, T1, T2, FuncT, true, false, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
output_rank_or_simple_broadcast,
*lhs_padded_strides,
lhs_data,
@ -279,7 +281,7 @@ void BinaryElementWiseImpl(
func,
N);
else if (rhs_padded_strides && rhs_padded_strides->Size())
_BinaryElementWise<T, T1, T2, FuncT, false, true, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
_BinaryElementWise<T, T1, T2, FuncT, false, true, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
output_rank_or_simple_broadcast,
TArray<int64_t>(), // lhs is not computed, so no need to deference lhs_padded_strides
lhs_data,

View file

@ -39,6 +39,7 @@ __global__ void _UnaryElementWise(
template <typename InT, typename OutT, typename FuncT>
void UnaryElementWiseImpl(
cudaStream_t stream,
const InT* input_data,
OutT* output_data,
const FuncT& func,
@ -49,7 +50,7 @@ void UnaryElementWiseImpl(
int blocksPerGrid = static_cast<int>(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
CUDA_LONG N = static_cast<CUDA_LONG>(count);
_UnaryElementWise<InT, OutT, FuncT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
input_data,
output_data,
func,

View file

@ -62,6 +62,7 @@ __global__ void VariadicElementWiseNoBroadcastInputBatchKernel(
// - inputs and output have N elements
template <typename T, typename Func, int32_t max_input_batch_size>
void VariadicElementWiseNoBroadcastInputBatchImpl(
cudaStream_t stream,
Func func,
size_t N,
TArray<const T*, max_input_batch_size> inputs,
@ -70,7 +71,7 @@ void VariadicElementWiseNoBroadcastInputBatchImpl(
constexpr int32_t threads_per_block = GridDim::maxThreadsPerBlock;
const int32_t blocks_per_grid = static_cast<int32_t>(CeilDiv(N, elements_per_thread * threads_per_block));
VariadicElementWiseNoBroadcastInputBatchKernel<T, Func, max_input_batch_size, elements_per_thread>
<<<blocks_per_grid, threads_per_block>>>(func, N, inputs, output);
<<<blocks_per_grid, threads_per_block, 0, stream>>>(func, N, inputs, output);
}
} // namespace cuda

View file

@ -59,10 +59,15 @@ ONNX_OPERATOR_KERNEL_EX(
} // namespace cuda
CUDAExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId device_id, size_t cuda_mem_limit, ArenaExtendStrategy arena_extend_strategy) {
CUDAExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId device_id, cudaStream_t stream, size_t cuda_mem_limit, ArenaExtendStrategy arena_extend_strategy) {
CUDA_CALL_THROW(cudaSetDevice(device_id));
stream_ = stream;
CUBLAS_CALL_THROW(cublasCreate(&cublas_handle_));
CUBLAS_CALL_THROW(cublasSetStream(cublas_handle_, stream));
CUDNN_CALL_THROW(cudnnCreate(&cudnn_handle_));
CUDNN_CALL_THROW(cudnnSetStream(cudnn_handle_, stream));
AllocatorCreationInfo default_memory_info(
[](OrtDevice::DeviceId id) {
@ -103,6 +108,12 @@ CUDAExecutionProvider::CUDAExecutionProvider(const CUDAExecutionProviderInfo& in
// must wait GPU idle, otherwise cudaGetDeviceProperties might fail
CUDA_CALL_THROW(cudaDeviceSynchronize());
CUDA_CALL_THROW(cudaGetDeviceProperties(&device_prop_, info_.device_id));
if (info.has_user_compute_stream) {
external_stream_ = true;
stream_ = static_cast<cudaStream_t>(info.user_compute_stream);
} else {
CUDA_CALL_THROW(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking));
}
size_t free = 0;
size_t total = 0;
@ -136,6 +147,10 @@ CUDAExecutionProvider::~CUDAExecutionProvider() {
ORT_IGNORE_RETURN_VALUE(cache->erase(this));
}
}
if (!external_stream_ && stream_) {
CUDA_CALL(cudaStreamDestroy(stream_));
}
}
CUDAExecutionProvider::PerThreadContext& CUDAExecutionProvider::GetPerThreadContext() const {
@ -156,7 +171,7 @@ CUDAExecutionProvider::PerThreadContext& CUDAExecutionProvider::GetPerThreadCont
// get or create a context
if (context_state_.retired_context_pool.empty()) {
context = std::make_shared<PerThreadContext>(info_.device_id, info_.cuda_mem_limit, info_.arena_extend_strategy);
context = std::make_shared<PerThreadContext>(info_.device_id, static_cast<cudaStream_t>(GetComputeStream()), info_.cuda_mem_limit, info_.arena_extend_strategy);
} else {
context = context_state_.retired_context_pool.back();
context_state_.retired_context_pool.pop_back();
@ -254,10 +269,24 @@ Status CUDAExecutionProvider::OnRunStart() {
Status CUDAExecutionProvider::OnRunEnd() {
// record deferred release event on default stream, and release per_thread_context
auto current_deferred_release_event = GetPerThreadContext().GetCurrentDeferredReleaseEvent();
CUDA_RETURN_IF_ERROR(cudaEventRecord(current_deferred_release_event, nullptr));
CUDA_RETURN_IF_ERROR(cudaEventRecord(current_deferred_release_event, static_cast<cudaStream_t>(GetComputeStream())));
CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(static_cast<cudaStream_t>(GetComputeStream())));
ReleasePerThreadContext();
std::lock_guard<OrtMutex> lock(deferred_release_cpu_ptr_mutex_);
deferred_release_cpu_ptr_[current_deferred_release_event].recorded = true;
return Status::OK();
}
Status CUDAExecutionProvider::SetComputeStream(void* stream) {
if (stream != stream_) {
if (stream_) {
CUDA_RETURN_IF_ERROR(cudaStreamDestroy(stream_));
}
external_stream_ = true;
stream_ = static_cast<cudaStream_t>(stream);
}
return Status::OK();
}
@ -1878,7 +1907,7 @@ static bool CastNeedFallbackToCPU(const onnxruntime::Node& node) {
}
std::unique_ptr<onnxruntime::IDataTransfer> CUDAExecutionProvider::GetDataTransfer() const {
return onnxruntime::make_unique<onnxruntime::GPUDataTransfer>(info_.do_copy_in_default_stream);
return onnxruntime::make_unique<onnxruntime::GPUDataTransfer>(static_cast<cudaStream_t>(GetComputeStream()), info_.do_copy_in_default_stream);
}
std::vector<std::unique_ptr<ComputeCapability>>

View file

@ -13,8 +13,8 @@
#include "core/platform/ort_mutex.h"
#include "core/providers/cuda/cuda_execution_provider_info.h"
#include "core/providers/cuda/cuda_pch.h"
#include "core/providers/cuda/gpu_data_transfer.h"
#include "core/providers/cuda/shared_inc/cuda_utils.h"
#include "core/providers/cuda/shared_inc/cuda_call.h"
namespace onnxruntime {
@ -37,6 +37,10 @@ class CUDAExecutionProvider : public IExecutionProvider {
return nullptr;
}
Status SetComputeStream(void* stream) override;
void* GetComputeStream() const override { return static_cast<void*>(stream_); }
cublasHandle_t PerThreadCublasHandle() {
return GetPerThreadContext().CublasHandle();
}
@ -80,6 +84,8 @@ class CUDAExecutionProvider : public IExecutionProvider {
private:
CUDAExecutionProviderInfo info_;
cudaDeviceProp device_prop_;
bool external_stream_ = false;
cudaStream_t stream_ = nullptr;
struct DeferredReleaseCPUPtrs {
bool recorded = false;
std::vector<void*> cpu_ptrs;
@ -90,7 +96,7 @@ class CUDAExecutionProvider : public IExecutionProvider {
class PerThreadContext final {
public:
PerThreadContext(OrtDevice::DeviceId device_id, size_t cuda_mem_limit, ArenaExtendStrategy arena_extend_strategy);
PerThreadContext(OrtDevice::DeviceId device_id, cudaStream_t stream, size_t cuda_mem_limit, ArenaExtendStrategy arena_extend_strategy);
~PerThreadContext();
cublasHandle_t CublasHandle() const {
@ -111,23 +117,23 @@ class CUDAExecutionProvider : public IExecutionProvider {
if (!constant_ones_float_) {
constant_ones_float_ = cuda::CreateConstantOnes<float>();
}
return reinterpret_cast<const T*>(constant_ones_float_->GetBuffer(count));
return reinterpret_cast<const T*>(constant_ones_float_->GetBuffer(stream_, count));
} else if (std::is_same<T, double>::value) {
if (!constant_ones_double_) {
constant_ones_double_ = cuda::CreateConstantOnes<double>();
}
return reinterpret_cast<const T*>(constant_ones_double_->GetBuffer(count));
return reinterpret_cast<const T*>(constant_ones_double_->GetBuffer(stream_, count));
} else if (std::is_same<T, half>::value) {
if (!constant_ones_half_) {
constant_ones_half_ = cuda::CreateConstantOnes<half>();
}
return reinterpret_cast<const T*>(constant_ones_half_->GetBuffer(count));
return reinterpret_cast<const T*>(constant_ones_half_->GetBuffer(stream_, count));
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
} else if (std::is_same<T, nv_bfloat16>::value) {
if (!constant_ones_bfloat16_) {
constant_ones_bfloat16_ = cuda::CreateConstantOnes<nv_bfloat16>();
}
return reinterpret_cast<const T*>(constant_ones_bfloat16_->GetBuffer(count));
return reinterpret_cast<const T*>(constant_ones_bfloat16_->GetBuffer(stream_, count));
#endif
} else {
return nullptr;
@ -139,6 +145,7 @@ class CUDAExecutionProvider : public IExecutionProvider {
}
private:
cudaStream_t stream_ = nullptr;
cublasHandle_t cublas_handle_ = nullptr;
cudnnHandle_t cudnn_handle_ = nullptr;

View file

@ -18,6 +18,8 @@ struct CUDAExecutionProviderInfo {
ArenaExtendStrategy arena_extend_strategy{ArenaExtendStrategy::kNextPowerOfTwo};
OrtCudnnConvAlgoSearch cudnn_conv_algo_search{OrtCudnnConvAlgoSearch::EXHAUSTIVE};
bool do_copy_in_default_stream{true};
bool has_user_compute_stream{false};
void* user_compute_stream{nullptr};
static CUDAExecutionProviderInfo FromProviderOptions(const ProviderOptions& options);
static ProviderOptions ToProviderOptions(const CUDAExecutionProviderInfo& info);

View file

@ -59,7 +59,9 @@ class CudaKernel : public OpKernel {
provider_->AddDeferredReleaseCPUPtr(p);
}
const cudaDeviceProp& GetDeviceProp() const { return provider_->GetDeviceProp(); };
const cudaDeviceProp& GetDeviceProp() const { return provider_->GetDeviceProp(); }
inline cudaStream_t Stream() const { return static_cast<cudaStream_t>(provider_->GetComputeStream()); }
// To support cudaMemcpyAsync, the cpu memory should be allocated in pinned memory
// and it can only be released after the copy has finished
@ -94,7 +96,7 @@ class CudaKernel : public OpKernel {
Status CopyToGpu() {
if (cpu_pinned_copy_) {
gpu_copy_ = op_kernel_->GetScratchBuffer<T>(count_);
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(gpu_copy_.get(), cpu_pinned_copy_.get(), count_ * sizeof(T), cudaMemcpyHostToDevice));
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(gpu_copy_.get(), cpu_pinned_copy_.get(), count_ * sizeof(T), cudaMemcpyHostToDevice, op_kernel_->Stream()));
op_kernel_->AddDeferredReleaseCPUPtr(cpu_pinned_copy_.release());
}
return Status::OK();

View file

@ -57,7 +57,8 @@ ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider_CUDA,
info.arena_extend_strategy = static_cast<onnxruntime::ArenaExtendStrategy>(cuda_options->arena_extend_strategy);
info.cudnn_conv_algo_search = cuda_options->cudnn_conv_algo_search;
info.do_copy_in_default_stream = cuda_options->do_copy_in_default_stream;
info.has_user_compute_stream = cuda_options->has_user_compute_stream;
info.user_compute_stream = cuda_options->user_compute_stream;
options->provider_factories.push_back(onnxruntime::CreateExecutionProviderFactory_CUDA(info));
return nullptr;

View file

@ -27,11 +27,11 @@ __global__ void _Fill(
}
template <typename T>
void Fill(T* output, T value, int64_t count) {
void Fill(cudaStream_t stream, T* output, T value, int64_t count) {
int blocksPerGrid = static_cast<int>(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
CUDA_LONG N = static_cast<CUDA_LONG>(count);
_Fill<T, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(output, value, N);
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(output, value, N);
}
template <typename T>
class ConstantBufferImpl : public IConstantBuffer<T> {
@ -43,7 +43,7 @@ class ConstantBufferImpl : public IConstantBuffer<T> {
cudaFree(buffer_);
}
virtual const T* GetBuffer(size_t count) {
virtual const T* GetBuffer(cudaStream_t stream, size_t count) {
if (count > count_) {
if (buffer_) {
cudaFree(buffer_);
@ -52,7 +52,7 @@ class ConstantBufferImpl : public IConstantBuffer<T> {
CUDA_CALL_THROW(cudaMalloc(&buffer_, count * sizeof(T)));
count_ = count;
Fill(buffer_, val_, count);
Fill(stream, buffer_, val_, count);
}
return buffer_;
}
@ -76,7 +76,7 @@ template std::unique_ptr<IConstantBuffer<nv_bfloat16>> CreateConstantOnes<nv_bfl
#endif
#define SPECIALIZED_FILL(T) \
template void Fill<T>(T * output, T value, int64_t count);
template void Fill<T>(cudaStream_t stream, T * output, T value, int64_t count);
SPECIALIZED_FILL(int8_t)
SPECIALIZED_FILL(int16_t)

View file

@ -65,30 +65,30 @@ __global__ void CopyVectorBFloat16(const nv_bfloat16* x, int incx, nv_bfloat16*
} // namespace
cublasStatus_t cublasTransposeHelper(cublasHandle_t, cublasOperation_t, cublasOperation_t, int m, int n, const half*, const half* A, int, const half*, const half*, int, half* C, int) {
cublasStatus_t cublasTransposeHelper(cudaStream_t stream, cublasHandle_t, cublasOperation_t, cublasOperation_t, int m, int n, const half*, const half* A, int, const half*, const half*, int, half* C, int) {
if (C != A) {
dim3 dimGrid((n + TRANS_TILE_DIM - 1) / TRANS_TILE_DIM, (m + TRANS_TILE_DIM - 1) / TRANS_TILE_DIM, 1);
dim3 dimBlock(TRANS_TILE_DIM, BLOCK_ROWS, 1);
transposeNoOverlap<<<dimGrid, dimBlock>>>(C, A, n, m);
transposeNoOverlap<<<dimGrid, dimBlock, 0, stream>>>(C, A, n, m);
} else {
return CUBLAS_STATUS_NOT_SUPPORTED;
}
return CUBLAS_STATUS_SUCCESS;
}
cublasStatus_t cublasCopyHelper(cublasHandle_t, int n, const half* x, int incx, half* y, int incy) {
cublasStatus_t cublasCopyHelper(cudaStream_t stream, cublasHandle_t, int n, const half* x, int incx, half* y, int incy) {
dim3 dimGrid((unsigned int)(n + COPY_BLOCK_DIM - 1) / COPY_BLOCK_DIM, 1, 1);
dim3 dimBlock(COPY_BLOCK_DIM, 1, 1);
CopyVectorHalf<<<dimGrid, dimBlock>>>(x, incx, y, incy, n);
CopyVectorHalf<<<dimGrid, dimBlock, 0, stream>>>(x, incx, y, incy, n);
return CUBLAS_STATUS_SUCCESS;
}
#if CUDA_VERSION >= 11000
cublasStatus_t cublasCopyHelper(cublasHandle_t, int n, const nv_bfloat16* x, int incx, nv_bfloat16* y, int incy) {
cublasStatus_t cublasCopyHelper(cudaStream_t stream, cublasHandle_t, int n, const nv_bfloat16* x, int incx, nv_bfloat16* y, int incy) {
dim3 dimGrid((unsigned int)(n + COPY_BLOCK_DIM - 1) / COPY_BLOCK_DIM, 1, 1);
dim3 dimBlock(COPY_BLOCK_DIM, 1, 1);
CopyVectorBFloat16<<<dimGrid, dimBlock>>>(x, incx, y, incy, n);
CopyVectorBFloat16<<<dimGrid, dimBlock, 0, stream>>>(x, incx, y, incy, n);
return CUBLAS_STATUS_SUCCESS;
}

View file

@ -32,7 +32,7 @@ Status ConstantOfShape::ComputeInternal(OpKernelContext* ctx) const {
#define CASE(TYPE) \
case sizeof(TYPE): \
if (size > 0) { \
cuda::Fill(reinterpret_cast<TYPE*>(output_data), *(reinterpret_cast<const TYPE*>(value_ptr)), size); \
cuda::Fill(Stream(), reinterpret_cast<TYPE*>(output_data), *(reinterpret_cast<const TYPE*>(value_ptr)), size); \
} \
break;

View file

@ -30,7 +30,7 @@ ONNX_OPERATOR_KERNEL_EX(
Range);
template <typename T>
static Status ComputeRange(OpKernelContext* ctx) {
static Status ComputeRange(cudaStream_t stream, OpKernelContext* ctx) {
const auto& start_tensor = *ctx->Input<Tensor>(0);
const auto& limit_tensor = *ctx->Input<Tensor>(1);
const auto* delta_tensor_ptr = ctx->Input<Tensor>(2);
@ -71,7 +71,7 @@ static Status ComputeRange(OpKernelContext* ctx) {
T* y = ctx->Output(0, shape)->template MutableData<T>();
if (count > 0) {
if (!RangeImpl(start, delta, count, y)) {
if (!RangeImpl(stream, start, delta, count, y)) {
CUDA_CALL(cudaGetLastError());
return Status(common::ONNXRUNTIME, common::FAIL);
}
@ -84,8 +84,8 @@ namespace cuda_range_internal {
template <class T>
struct CallCudaRangeImpl {
Status operator()(OpKernelContext* ctx) const {
return ComputeRange<T>(ctx);
Status operator()(cudaStream_t stream, OpKernelContext* ctx) const {
return ComputeRange<T>(stream, ctx);
}
};
@ -100,7 +100,7 @@ Status Range::ComputeInternal(OpKernelContext* ctx) const {
utils::MLTypeCallDispatcherRet<Status, cuda_range_internal::CallCudaRangeImpl, int32_t,
float, int64_t, double, int16_t>
t_disp(input_tensor->GetElementType());
return t_disp.Invoke(ctx);
return t_disp.Invoke(Stream(), ctx);
}
} // namespace cuda

View file

@ -22,15 +22,15 @@ __global__ void RangeKernel(const T start, const T delta, const int count, T* ou
}
template <typename T>
bool RangeImpl(const T start, const T delta, const int count, T* output) {
bool RangeImpl(cudaStream_t stream, const T start, const T delta, const int count, T* output) {
constexpr int block_size = 256;
int grid_size = (count + block_size - 1) / block_size;
RangeKernel<T><<<grid_size, block_size, 0>>>(start, delta, count, output);
RangeKernel<T><<<grid_size, block_size, 0, stream>>>(start, delta, count, output);
return CUDA_CALL(cudaPeekAtLastError());
}
#define SPECIALIZED_IMPL(T) \
template bool RangeImpl<T>(const T start, const T delta, const int count, T* output);
template bool RangeImpl<T>(cudaStream_t stream, const T start, const T delta, const int count, T* output);
SPECIALIZED_IMPL(int16_t)
SPECIALIZED_IMPL(int32_t)

View file

@ -9,7 +9,7 @@ namespace cuda {
template <typename T>
bool RangeImpl(const T start, const T delta, const int count, T* output);
bool RangeImpl(cudaStream_t stream, const T start, const T delta, const int count, T* output);
} // namespace cuda
} // namespace onnxruntime

View file

@ -9,12 +9,13 @@
// so we leave it as optional, in case user need the previous behavior
// a full fix to BFC arena is being looked at, and once it's in, we can revert this change
namespace onnxruntime {
GPUDataTransfer::GPUDataTransfer(bool do_copy_in_default_stream) {
GPUDataTransfer::GPUDataTransfer(cudaStream_t stream, bool do_copy_in_default_stream) {
// create streams, default is nullptr
streams_[kCudaStreamDefault] = nullptr;
do_copy_in_default_stream_ = do_copy_in_default_stream;
streams_[kCudaStreamDefault] = stream;
if (do_copy_in_default_stream) {
streams_[kCudaStreamCopyIn] = nullptr;
streams_[kCudaStreamCopyOut] = nullptr;
streams_[kCudaStreamCopyIn] = stream;
streams_[kCudaStreamCopyOut] = stream;
} else {
CUDA_CALL_THROW(cudaStreamCreateWithFlags(&streams_[kCudaStreamCopyIn], cudaStreamNonBlocking));
CUDA_CALL_THROW(cudaStreamCreateWithFlags(&streams_[kCudaStreamCopyOut], cudaStreamNonBlocking));
@ -22,10 +23,10 @@ GPUDataTransfer::GPUDataTransfer(bool do_copy_in_default_stream) {
}
GPUDataTransfer::~GPUDataTransfer() {
if (streams_[kCudaStreamCopyIn] != nullptr) {
if (!do_copy_in_default_stream_ && streams_[kCudaStreamCopyIn] != nullptr) {
CUDA_CALL(cudaStreamDestroy(streams_[kCudaStreamCopyIn]));
}
if (streams_[kCudaStreamCopyOut] != nullptr) {
if (!do_copy_in_default_stream_ && streams_[kCudaStreamCopyOut] != nullptr) {
CUDA_CALL(cudaStreamDestroy(streams_[kCudaStreamCopyOut]));
}
}
@ -46,24 +47,26 @@ common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int e
if (dst_device.Type() == OrtDevice::GPU) {
if (src_device.Type() == OrtDevice::CPU && src_device.MemType() == OrtDevice::MemType::CUDA_PINNED) {
// copy from pinned memory to GPU, this is non-blocking
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyHostToDevice, streams_[exec_queue_id]));
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyHostToDevice, GetStream(exec_queue_id)));
} else if (src_device.Type() == OrtDevice::GPU) {
// copying between GPU, this is non-blocking
// Copy only if the two addresses are different.
if (dst_data != src_data) {
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToDevice, streams_[kCudaStreamDefault]));
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToDevice, GetStream(kCudaStreamDefault)));
}
} else {
// copy from other CPU memory to GPU, this is blocking
CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyHostToDevice));
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyHostToDevice, GetStream(kCudaStreamDefault)));
CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(GetStream(kCudaStreamDefault)));
}
} else if (src_device.Type() == OrtDevice::GPU) {
if (dst_device.Type() == OrtDevice::CPU && dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED) {
// copying from GPU to pinned memory, this is non-blocking
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToHost, streams_[exec_queue_id]));
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToHost, GetStream(exec_queue_id)));
} else {
// copying from GPU to CPU memory, this is blocking
CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyDeviceToHost));
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToHost, GetStream(kCudaStreamDefault)));
CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(GetStream(kCudaStreamDefault)));
}
} else {
// copying between cpu memory

View file

@ -17,7 +17,7 @@ enum CUDAStreamType : int {
class GPUDataTransfer : public IDataTransfer {
public:
GPUDataTransfer(bool do_copy_in_default_stream = true);
GPUDataTransfer(cudaStream_t stream, bool do_copy_in_default_stream = true);
~GPUDataTransfer();
bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override;
@ -32,6 +32,7 @@ class GPUDataTransfer : public IDataTransfer {
}
private:
bool do_copy_in_default_stream_;
cudaStream_t streams_[kTotalCudaStreams];
};

View file

@ -145,6 +145,7 @@ Status BinaryElementwise<ShouldBroadcast>::Prepare(OpKernelContext* context, Bin
BinaryElementwisePreparation prepare; \
ORT_RETURN_IF_ERROR(Prepare(context, &prepare)); \
Impl_##x<typename ToCudaType<T>::MappedType>( \
Stream(), \
prepare.output_rank_or_simple_broadcast, \
&prepare.lhs_padded_strides, \
reinterpret_cast<const typename ToCudaType<T>::MappedType*>(prepare.lhs_tensor->template Data<T>()), \
@ -315,12 +316,13 @@ ONNX_OPERATOR_KERNEL_EX(
namespace pow12_internal {
template <class T>
Status DispatchOnFirstArg(const BinaryElementwisePreparation& prepare) {
Status DispatchOnFirstArg(cudaStream_t stream, const BinaryElementwisePreparation& prepare) {
namespace on = ONNX_NAMESPACE;
Status s;
switch (prepare.rhs_tensor->GetElementType()) {
case on::TensorProto_DataType_INT32:
ImplT1_Pow<typename ToCudaType<T>::MappedType, typename ToCudaType<int32_t>::MappedType>(
stream,
prepare.output_rank_or_simple_broadcast,
&prepare.lhs_padded_strides,
reinterpret_cast<const typename ToCudaType<T>::MappedType*>(prepare.lhs_tensor->template Data<T>()),
@ -334,6 +336,7 @@ Status DispatchOnFirstArg(const BinaryElementwisePreparation& prepare) {
break;
case on::TensorProto_DataType_INT64:
ImplT1_Pow<typename ToCudaType<T>::MappedType, typename ToCudaType<int64_t>::MappedType>(
stream,
prepare.output_rank_or_simple_broadcast,
&prepare.lhs_padded_strides,
reinterpret_cast<const typename ToCudaType<T>::MappedType*>(prepare.lhs_tensor->template Data<T>()),
@ -347,6 +350,7 @@ Status DispatchOnFirstArg(const BinaryElementwisePreparation& prepare) {
break;
case on::TensorProto_DataType_FLOAT:
ImplT1_Pow<typename ToCudaType<T>::MappedType, typename ToCudaType<float>::MappedType>(
stream,
prepare.output_rank_or_simple_broadcast,
&prepare.lhs_padded_strides,
reinterpret_cast<const typename ToCudaType<T>::MappedType*>(prepare.lhs_tensor->template Data<T>()),
@ -360,6 +364,7 @@ Status DispatchOnFirstArg(const BinaryElementwisePreparation& prepare) {
break;
case on::TensorProto_DataType_DOUBLE:
ImplT1_Pow<typename ToCudaType<T>::MappedType, typename ToCudaType<double>::MappedType>(
stream,
prepare.output_rank_or_simple_broadcast,
&prepare.lhs_padded_strides,
reinterpret_cast<const typename ToCudaType<T>::MappedType*>(prepare.lhs_tensor->template Data<T>()),
@ -373,6 +378,7 @@ Status DispatchOnFirstArg(const BinaryElementwisePreparation& prepare) {
break;
case on::TensorProto_DataType_FLOAT16:
ImplT1_Pow<typename ToCudaType<T>::MappedType, typename ToCudaType<MLFloat16>::MappedType>(
stream,
prepare.output_rank_or_simple_broadcast,
&prepare.lhs_padded_strides,
reinterpret_cast<const typename ToCudaType<T>::MappedType*>(prepare.lhs_tensor->template Data<T>()),
@ -402,19 +408,19 @@ Status Pow::ComputeInternal(OpKernelContext* context) const {
switch (prepare.lhs_tensor->GetElementType()) {
case on::TensorProto_DataType_INT32:
s = DispatchOnFirstArg<int32_t>(prepare);
s = DispatchOnFirstArg<int32_t>(Stream(), prepare);
break;
case on::TensorProto_DataType_INT64:
s = DispatchOnFirstArg<int64_t>(prepare);
s = DispatchOnFirstArg<int64_t>(Stream(), prepare);
break;
case on::TensorProto_DataType_FLOAT:
s = DispatchOnFirstArg<float>(prepare);
s = DispatchOnFirstArg<float>(Stream(), prepare);
break;
case on::TensorProto_DataType_DOUBLE:
s = DispatchOnFirstArg<double>(prepare);
s = DispatchOnFirstArg<double>(Stream(), prepare);
break;
case on::TensorProto_DataType_FLOAT16:
s = DispatchOnFirstArg<MLFloat16>(prepare);
s = DispatchOnFirstArg<MLFloat16>(Stream(), prepare);
break;
default:
s = ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported X type: ",
@ -431,6 +437,7 @@ Status CompareFunction<T, CudaT>::CompareMethod(OpKernelContext* context, ImplCo
ORT_RETURN_IF_ERROR(Prepare(context, &prepare));
Impl_Compare(
Stream(),
prepare.output_rank_or_simple_broadcast,
&prepare.lhs_padded_strides,
reinterpret_cast<const CudaT*>(prepare.lhs_tensor->template Data<T>()),

View file

@ -219,7 +219,8 @@ class CompareFunction : public BinaryElementwise<ShouldBroadcast> {
public:
CompareFunction(const OpKernelInfo& info) : BinaryElementwise(info) {}
typedef void (*ImplCompare)(int32_t output_rank_or_simple_broadcast,
typedef void (*ImplCompare)(cudaStream_t stream,
int32_t output_rank_or_simple_broadcast,
const TArray<int64_t>* lhs_padded_strides,
const CudaT* lhs_data,
const TArray<int64_t>* rhs_padded_strides,

View file

@ -12,7 +12,8 @@ namespace cuda {
#define BINARY_ELEMENTWISE_IMPL(name) \
BINARY_ELEMENTWISE_IMPL_DECLARATION(name) { \
BinaryElementWiseImpl(output_rank_or_simple_broadcast, \
BinaryElementWiseImpl(stream, \
output_rank_or_simple_broadcast, \
lhs_padded_strides, \
lhs_data, \
rhs_padded_strides, \
@ -27,7 +28,8 @@ namespace cuda {
#define BINARY_ELEMENTWISE_IMPL_T1(name) \
BINARY_ELEMENTWISE_IMPL_DECLARATION_T1(name) { \
BinaryElementWiseImpl(output_rank_or_simple_broadcast, \
BinaryElementWiseImpl(stream, \
output_rank_or_simple_broadcast, \
lhs_padded_strides, \
lhs_data, \
rhs_padded_strides, \
@ -42,7 +44,8 @@ namespace cuda {
#define BINARY_ELEMENTWISE_IMPL_T2(name) \
BINARY_ELEMENTWISE_IMPL_DECLARATION_T2(name) { \
BinaryElementWiseImpl(output_rank_or_simple_broadcast, \
BinaryElementWiseImpl(stream, \
output_rank_or_simple_broadcast, \
lhs_padded_strides, \
lhs_data, \
rhs_padded_strides, \
@ -56,19 +59,22 @@ namespace cuda {
}
#define SPECIALIZED_BINARY_ELEMENTWISE_IMPL(x, T) \
template void Impl_##x<T>(int32_t output_rank, \
template void Impl_##x<T>(cudaStream_t stream, \
int32_t output_rank, \
const TArray<int64_t>* lhs_padded_strides, const T* lhs_data, \
const TArray<int64_t>* rhs_padded_strides, const T* rhs_data, \
const TArray<fast_divmod>* fdm_output_strides, const fast_divmod& fdm_H, const fast_divmod& fdm_C, T* output_data, size_t count);
#define SPECIALIZED_BINARY_ELEMENTWISE_IMPL_T1(x, T, T1) \
template void ImplT1_##x<T, T1>(int32_t output_rank, \
template void ImplT1_##x<T, T1>(cudaStream_t stream, \
int32_t output_rank, \
const TArray<int64_t>* lhs_padded_strides, const T* lhs_data, \
const TArray<int64_t>* rhs_padded_strides, const T1* rhs_data, \
const TArray<fast_divmod>* fdm_output_strides, const fast_divmod& fdm_H, const fast_divmod& fdm_C, T* output_data, size_t count);
#define SPECIALIZED_BINARY_ELEMENTWISE_IMPL_T2(x, T, T1, T2) \
template void ImplT2_##x<T, T1, T2>(int32_t output_rank, \
template void ImplT2_##x<T, T1, T2>(cudaStream_t stream, \
int32_t output_rank, \
const TArray<int64_t>* lhs_padded_strides, const T1* lhs_data, \
const TArray<int64_t>* rhs_padded_strides, const T2* rhs_data, \
const TArray<fast_divmod>* fdm_output_strides, const fast_divmod& fdm_H, const fast_divmod& fdm_C, T* output_data, size_t count);

View file

@ -34,6 +34,7 @@ namespace cuda {
#define BINARY_ELEMENTWISE_IMPL_DECLARATION(name) \
template <typename T> \
void Impl_##name( \
cudaStream_t stream, \
int32_t output_rank_or_simple_broadcast, \
const TArray<int64_t>* lhs_padded_strides, \
const T* lhs_data, \
@ -52,6 +53,7 @@ BINARY_OPS()
#define BINARY_ELEMENTWISE_IMPL_DECLARATION_T1(name) \
template <typename T, typename T1> \
void ImplT1_##name( \
cudaStream_t stream, \
int32_t output_rank_or_simple_broadcast, \
const TArray<int64_t>* lhs_padded_strides, \
const T* lhs_data, \
@ -68,6 +70,7 @@ BINARY_ELEMENTWISE_IMPL_DECLARATION_T1(Pow);
#define BINARY_ELEMENTWISE_IMPL_DECLARATION_T2(name) \
template <typename T, typename T1, typename T2> \
void ImplT2_##name( \
cudaStream_t stream, \
int32_t output_rank_or_simple_broadcast, \
const TArray<int64_t>* lhs_padded_strides, \
const T1* lhs_data, \

View file

@ -62,7 +62,7 @@ Status Clip_6<T>::ComputeInternal(OpKernelContext* ctx) const {
if (count > 0) {
auto* y_data = Y->template MutableData<T>();
const auto* x_data = X.template Data<T>();
ClipImpl<T>(x_data, y_data, this->min_, this->max_, count);
ClipImpl<T>(Stream(), x_data, y_data, this->min_, this->max_, count);
}
return Status::OK();
}
@ -91,7 +91,7 @@ struct LowMax<MLFloat16> {
template <typename T>
struct Clip::ComputeImpl {
void operator()(const Tensor* X, const Tensor* min, const Tensor* max, Tensor* Y) const {
void operator()(cudaStream_t stream, const Tensor* X, const Tensor* min, const Tensor* max, Tensor* Y) const {
auto min_val = clip_internal::LowMax<T>::low();
auto max_val = clip_internal::LowMax<T>::max();
@ -110,7 +110,7 @@ struct Clip::ComputeImpl {
if (count > 0) {
auto* y_data = Y->template MutableData<T>();
const auto* x_data = X->template Data<T>();
ClipImpl<T>(x_data, y_data, min_val, max_val, count);
ClipImpl<T>(stream, x_data, y_data, min_val, max_val, count);
}
}
};
@ -124,7 +124,7 @@ Status Clip::ComputeInternal(OpKernelContext* ctx) const {
utils::MLTypeCallDispatcher<ComputeImpl, float, double, int8_t, uint8_t, int64_t, uint64_t>
t_disp(X->GetElementType());
t_disp.Invoke(X, min, max, Y);
t_disp.Invoke(Stream(), X, min, max, Y);
return Status::OK();
}

View file

@ -13,24 +13,24 @@ __global__ void _Clip(const T* input, T* output, T min, T max, size_t N) {
}
template <typename T>
void ClipImpl(const T* input_data, T* output_data, T min, T max, size_t count) {
void ClipImpl(cudaStream_t stream, const T* input_data, T* output_data, T min, T max, size_t count) {
typedef typename ToCudaType<T>::MappedType CudaT;
int blocksPerGrid = (int)(ceil(static_cast<float>(count) / GridDim::maxThreadsPerBlock));
_Clip<CudaT><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(reinterpret_cast<const CudaT*>(input_data),
_Clip<CudaT><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(reinterpret_cast<const CudaT*>(input_data),
reinterpret_cast<CudaT*>(output_data),
*reinterpret_cast<CudaT*>(&min),
*reinterpret_cast<CudaT*>(&max),
count);
}
template void ClipImpl<float>(const float* input_data, float* output_data, float min, float max, size_t count);
template void ClipImpl<double>(const double* input_data, double* output_data, double min, double max, size_t count);
template void ClipImpl<MLFloat16>(const MLFloat16* input_data, MLFloat16* output_data, MLFloat16 min, MLFloat16 max, size_t count);
template void ClipImpl<int8_t>(const int8_t* input_data, int8_t* output_data, int8_t min, int8_t max, size_t count);
template void ClipImpl<uint8_t>(const uint8_t* input_data, uint8_t* output_data, uint8_t min, uint8_t max, size_t count);
template void ClipImpl<int64_t>(const int64_t* input_data, int64_t* output_data, int64_t min, int64_t max, size_t count);
template void ClipImpl<uint64_t>(const uint64_t* input_data, uint64_t* output_data, uint64_t min, uint64_t max, size_t count);
template void ClipImpl<float>(cudaStream_t stream, const float* input_data, float* output_data, float min, float max, size_t count);
template void ClipImpl<double>(cudaStream_t stream, const double* input_data, double* output_data, double min, double max, size_t count);
template void ClipImpl<MLFloat16>(cudaStream_t stream, const MLFloat16* input_data, MLFloat16* output_data, MLFloat16 min, MLFloat16 max, size_t count);
template void ClipImpl<int8_t>(cudaStream_t stream, const int8_t* input_data, int8_t* output_data, int8_t min, int8_t max, size_t count);
template void ClipImpl<uint8_t>(cudaStream_t stream, const uint8_t* input_data, uint8_t* output_data, uint8_t min, uint8_t max, size_t count);
template void ClipImpl<int64_t>(cudaStream_t stream, const int64_t* input_data, int64_t* output_data, int64_t min, int64_t max, size_t count);
template void ClipImpl<uint64_t>(cudaStream_t stream, const uint64_t* input_data, uint64_t* output_data, uint64_t min, uint64_t max, size_t count);
} // namespace cuda
} // namespace onnxruntime

View file

@ -10,7 +10,7 @@
namespace onnxruntime {
namespace cuda {
template <typename T>
void ClipImpl(const T* input_data, T* output_data, T min, T max, size_t count);
void ClipImpl(cudaStream_t stream, const T* input_data, T* output_data, T min, T max, size_t count);
} // namespace cuda
} // namespace onnxruntime

View file

@ -77,7 +77,7 @@ Status CumSum::ComputeInternal(OpKernelContext* ctx) const {
fast_divmod fast_divmod_input_stride_along_axis(static_cast<int>(input_stride_along_axis));
if (input->IsDataType<float>()) {
CumSumImpl(reinterpret_cast<const typename ToCudaType<float>::MappedType*>(input->Data<float>()),
CumSumImpl(Stream(), reinterpret_cast<const typename ToCudaType<float>::MappedType*>(input->Data<float>()),
fast_divmod_input_dim_along_axis,
fast_divmod_input_stride_along_axis,
reinterpret_cast<typename ToCudaType<float>::MappedType*>(output.MutableData<float>()),
@ -85,7 +85,7 @@ Status CumSum::ComputeInternal(OpKernelContext* ctx) const {
exclusive_,
reverse_);
} else if (input->IsDataType<double>()) {
CumSumImpl(reinterpret_cast<const typename ToCudaType<double>::MappedType*>(input->Data<double>()),
CumSumImpl(Stream(), reinterpret_cast<const typename ToCudaType<double>::MappedType*>(input->Data<double>()),
fast_divmod_input_dim_along_axis,
fast_divmod_input_stride_along_axis,
reinterpret_cast<typename ToCudaType<double>::MappedType*>(output.MutableData<double>()),
@ -93,7 +93,7 @@ Status CumSum::ComputeInternal(OpKernelContext* ctx) const {
exclusive_,
reverse_);
} else if (input->IsDataType<int32_t>()) {
CumSumImpl(reinterpret_cast<const typename ToCudaType<int32_t>::MappedType*>(input->Data<int32_t>()),
CumSumImpl(Stream(), reinterpret_cast<const typename ToCudaType<int32_t>::MappedType*>(input->Data<int32_t>()),
fast_divmod_input_dim_along_axis,
fast_divmod_input_stride_along_axis,
reinterpret_cast<typename ToCudaType<int32_t>::MappedType*>(output.MutableData<int32_t>()),
@ -101,7 +101,7 @@ Status CumSum::ComputeInternal(OpKernelContext* ctx) const {
exclusive_,
reverse_);
} else if (input->IsDataType<int64_t>()) {
CumSumImpl(reinterpret_cast<const typename ToCudaType<int64_t>::MappedType*>(input->Data<int64_t>()),
CumSumImpl(Stream(), reinterpret_cast<const typename ToCudaType<int64_t>::MappedType*>(input->Data<int64_t>()),
fast_divmod_input_dim_along_axis,
fast_divmod_input_stride_along_axis,
reinterpret_cast<typename ToCudaType<int64_t>::MappedType*>(output.MutableData<int64_t>()),
@ -109,7 +109,7 @@ Status CumSum::ComputeInternal(OpKernelContext* ctx) const {
exclusive_,
reverse_);
} else if (input->IsDataType<uint32_t>()) {
CumSumImpl(reinterpret_cast<const typename ToCudaType<uint32_t>::MappedType*>(input->Data<uint32_t>()),
CumSumImpl(Stream(), reinterpret_cast<const typename ToCudaType<uint32_t>::MappedType*>(input->Data<uint32_t>()),
fast_divmod_input_dim_along_axis,
fast_divmod_input_stride_along_axis,
reinterpret_cast<typename ToCudaType<uint32_t>::MappedType*>(output.MutableData<uint32_t>()),
@ -117,7 +117,7 @@ Status CumSum::ComputeInternal(OpKernelContext* ctx) const {
exclusive_,
reverse_);
} else if (input->IsDataType<uint64_t>()) {
CumSumImpl(reinterpret_cast<const typename ToCudaType<uint64_t>::MappedType*>(input->Data<uint64_t>()),
CumSumImpl(Stream(), reinterpret_cast<const typename ToCudaType<uint64_t>::MappedType*>(input->Data<uint64_t>()),
fast_divmod_input_dim_along_axis,
fast_divmod_input_stride_along_axis,
reinterpret_cast<typename ToCudaType<uint64_t>::MappedType*>(output.MutableData<uint64_t>()),
@ -125,7 +125,7 @@ Status CumSum::ComputeInternal(OpKernelContext* ctx) const {
exclusive_,
reverse_);
} else if (input->IsDataType<MLFloat16>()) {
CumSumImpl(reinterpret_cast<const typename ToCudaType<MLFloat16>::MappedType*>(input->Data<MLFloat16>()),
CumSumImpl(Stream(), reinterpret_cast<const typename ToCudaType<MLFloat16>::MappedType*>(input->Data<MLFloat16>()),
fast_divmod_input_dim_along_axis,
fast_divmod_input_stride_along_axis,
reinterpret_cast<typename ToCudaType<MLFloat16>::MappedType*>(output.MutableData<MLFloat16>()),

View file

@ -71,6 +71,7 @@ __global__ void _CumSumKernel(
template <typename T>
void CumSumImpl(
cudaStream_t stream,
const T* input_data,
const fast_divmod& input_dim_along_axis,
const fast_divmod& input_stride_along_axis,
@ -81,7 +82,7 @@ void CumSumImpl(
if (output_size > 0) {
int blocksPerGrid = static_cast<int>((output_size + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock);
_CumSumKernel<T><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(input_data,
_CumSumKernel<T><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(input_data,
input_dim_along_axis,
input_stride_along_axis,
output_data,
@ -92,6 +93,7 @@ void CumSumImpl(
}
template void CumSumImpl<int32_t>(
cudaStream_t stream,
const int32_t* input_data,
const fast_divmod& input_dim_along_axis,
const fast_divmod& input_stride_along_axis,
@ -101,6 +103,7 @@ template void CumSumImpl<int32_t>(
bool reverse);
template void CumSumImpl<int64_t>(
cudaStream_t stream,
const int64_t* input_data,
const fast_divmod& input_dim_along_axis,
const fast_divmod& input_stride_along_axis,
@ -110,6 +113,7 @@ template void CumSumImpl<int64_t>(
bool reverse);
template void CumSumImpl<uint32_t>(
cudaStream_t stream,
const uint32_t* input_data,
const fast_divmod& input_dim_along_axis,
const fast_divmod& input_stride_along_axis,
@ -119,6 +123,7 @@ template void CumSumImpl<uint32_t>(
bool reverse);
template void CumSumImpl<uint64_t>(
cudaStream_t stream,
const uint64_t* input_data,
const fast_divmod& input_dim_along_axis,
const fast_divmod& input_stride_along_axis,
@ -128,6 +133,7 @@ template void CumSumImpl<uint64_t>(
bool reverse);
template void CumSumImpl<float>(
cudaStream_t stream,
const float* input_data,
const fast_divmod& input_dim_along_axis,
const fast_divmod& input_stride_along_axis,
@ -137,6 +143,7 @@ template void CumSumImpl<float>(
bool reverse);
template void CumSumImpl<double>(
cudaStream_t stream,
const double* input_data,
const fast_divmod& input_dim_along_axis,
const fast_divmod& input_stride_along_axis,
@ -146,6 +153,7 @@ template void CumSumImpl<double>(
bool reverse);
template void CumSumImpl<half>(
cudaStream_t stream,
const half* input_data,
const fast_divmod& input_dim_along_axis,
const fast_divmod& input_stride_along_axis,

View file

@ -11,6 +11,7 @@ namespace cuda {
template <typename T>
void CumSumImpl(
cudaStream_t stream,
const T* input_data,
const fast_divmod& input_dim_along_axis,
const fast_divmod& input_stride_along_axis,

View file

@ -12,14 +12,15 @@ namespace DeviceHelpers {
namespace CudaDeviceHelpers {
// CUDA EP specific Data copy helper
Status DataCopy(const Tensor& input, Tensor& output) {
Status DataCopy(const Tensor& input, Tensor& output, void* einsum_cuda_assets) {
ORT_ENFORCE(output.SizeInBytes() == input.SizeInBytes(),
"Einsum op: The candidate output does not match the actual output's shape");
// There are no string tensors in Einsum's case - so safely use memcpy
// TODO: Currently, triggers copy on stream 0, investigate if we can still do that
// *if* the kernel is launched in a different stream
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output.MutableDataRaw(), input.DataRaw(), input.SizeInBytes(),
cudaMemcpyDeviceToDevice));
cudaMemcpyDeviceToDevice,
static_cast<cudaStream_t>(static_cast<EinsumCudaAssets*>(einsum_cuda_assets)->cuda_ep_->GetComputeStream())));
return Status::OK();
}
@ -28,6 +29,7 @@ Status DataCopy(const Tensor& input, Tensor& output) {
Status Transpose(const std::vector<size_t>& permutation, const Tensor& input,
Tensor& output, const TensorShape* input_shape_override, void* einsum_cuda_assets) {
return cuda::Transpose::DoTranspose(static_cast<EinsumCudaAssets*>(einsum_cuda_assets)->cuda_ep_->GetDeviceProp(),
static_cast<cudaStream_t>(static_cast<EinsumCudaAssets*>(einsum_cuda_assets)->cuda_ep_->GetComputeStream()),
static_cast<EinsumCudaAssets*>(einsum_cuda_assets)->cublas_handle_,
permutation, input, output, input_shape_override);
}
@ -79,7 +81,7 @@ Tensor ReduceSum(const Tensor& input, const std::vector<int64_t>& reduce_axes,
}
// CUDA EP specific Diagonal helper
std::unique_ptr<Tensor> Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator) {
std::unique_ptr<Tensor> Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator, void* einsum_cuda_assets) {
const auto& input_shape = input.Shape();
const auto& input_dims = input_shape.GetDims();
auto rank = static_cast<int64_t>(input_dims.size());
@ -117,6 +119,7 @@ std::unique_ptr<Tensor> Diagonal(const Tensor& input, int64_t dim_1, int64_t dim
}
DiagonalImpl(
static_cast<cudaStream_t>(static_cast<EinsumCudaAssets*>(einsum_cuda_assets)->cuda_ep_->GetComputeStream()),
input.DataRaw(),
input.Shape().GetDims().size(),
first_dim,

View file

@ -38,7 +38,7 @@ namespace CudaDeviceHelpers {
Status Transpose(const std::vector<size_t>& permutation, const Tensor& input,
Tensor& output, const TensorShape* input_shape_override, void* einsum_cuda_assets);
Status DataCopy(const Tensor& input, Tensor& output);
Status DataCopy(const Tensor& input, Tensor& output, void* einsum_cuda_assets);
template <typename T>
Status MatMul(const T* input_1_data, const T* input_2_data, T* output_data,
@ -52,7 +52,7 @@ Tensor ReduceSum(const Tensor& input, const std::vector<int64_t>& reduce_axes,
const TensorShape* input_shape_override,
concurrency::ThreadPool* /*tp*/, void* einsum_cuda_assets);
std::unique_ptr<Tensor> Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator);
std::unique_ptr<Tensor> Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator, void* einsum_cuda_assets);
} // namespace CudaDeviceHelpers

View file

@ -47,6 +47,7 @@ __global__ void _DiagonalKernel(
}
void DiagonalImpl(
cudaStream_t stream,
const void* input_data,
const int64_t input_rank,
const int64_t dim_1,
@ -61,14 +62,14 @@ void DiagonalImpl(
switch (element_size) {
case sizeof(int32_t):
_DiagonalKernel<int32_t><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
_DiagonalKernel<int32_t><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
reinterpret_cast<const ToCudaType<int32_t>::MappedType*>(input_data), input_rank, dim_1, dim_2,
input_strides, reinterpret_cast<ToCudaType<int32_t>::MappedType*>(output_data), output_strides,
output_size);
break;
case sizeof(int64_t):
_DiagonalKernel<int64_t><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
_DiagonalKernel<int64_t><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
reinterpret_cast<const ToCudaType<int64_t>::MappedType*>(input_data), input_rank, dim_1, dim_2,
input_strides, reinterpret_cast<ToCudaType<int64_t>::MappedType*>(output_data), output_strides,
output_size);

View file

@ -10,6 +10,7 @@ namespace onnxruntime {
namespace cuda {
void DiagonalImpl(
cudaStream_t stream,
const void* input_data,
const int64_t input_rank,
const int64_t dim_1,

View file

@ -86,6 +86,7 @@ Status Gemm<T>::ComputeInternal(OpKernelContext* ctx) const {
if (b_shape.Size() == 1) {
// if B is (), (1,) or (1, 1), broadcast the scalar
CUBLAS_RETURN_IF_ERROR(cublasCopyHelper(
Stream(),
CublasHandle(),
M * N,
b_data,
@ -118,7 +119,7 @@ Status Gemm<T>::ComputeInternal(OpKernelContext* ctx) const {
out_data, N, device_prop));
} else {
// B is (M, N), no broadcast needed.
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(out_data, b_data, M * N * sizeof(T), cudaMemcpyDeviceToDevice));
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(out_data, b_data, M * N * sizeof(T), cudaMemcpyDeviceToDevice, Stream()));
}
}

View file

@ -70,19 +70,20 @@ Status MatMulInteger<int8_t, int8_t>::ComputeInternal(OpKernelContext* ctx) cons
IAllocatorUniquePtr<int32_t> a_row_buf;
if (b_offset != 0) {
a_row_buf = GetScratchBuffer<int32_t>(helper.OutputShape().Size() / helper.N());
ORT_RETURN_IF_ERROR(ReduceRowSumOnMatrixA(a_ptr, a_row_buf.get(), b_offset, helper));
ORT_RETURN_IF_ERROR(ReduceRowSumOnMatrixA(Stream(), a_ptr, a_row_buf.get(), b_offset, helper));
}
IAllocatorUniquePtr<int32_t> b_col_buf;
if (a_offset != 0) {
b_col_buf = GetScratchBuffer<int32_t>(helper.OutputShape().Size() / helper.M());
ORT_RETURN_IF_ERROR(ReduceColSumOnMatrixB(b_ptr, b_col_buf.get(), a_offset, helper));
ORT_RETURN_IF_ERROR(ReduceColSumOnMatrixB(Stream(), b_ptr, b_col_buf.get(), a_offset, helper));
}
int alpha = 1;
int beta = 0;
if (a_offset != 0 || b_offset != 0) {
OffsetOutput(a_row_buf.get(),
OffsetOutput(Stream(),
a_row_buf.get(),
b_col_buf.get(),
output_ptr,
a_offset,

View file

@ -26,9 +26,9 @@ __global__ void ReduceRowSumOnMatrixAKernel(const int8_t* matrix, int32_t* row_s
}
}
Status ReduceRowSumOnMatrixA(const int8_t* matrix, int32_t* row_sum, const int8_t offset, const MatMulComputeHelper& helper) {
Status ReduceRowSumOnMatrixA(cudaStream_t stream, const int8_t* matrix, int32_t* row_sum, const int8_t offset, const MatMulComputeHelper& helper) {
for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
ReduceRowSumOnMatrixAKernel<static_cast<int>(GridDim::maxThreadsPerBlock)><<<static_cast<int>(helper.M()), GridDim::maxThreadsPerBlock, 0>>>(matrix + helper.LeftOffsets()[batch],
ReduceRowSumOnMatrixAKernel<static_cast<int>(GridDim::maxThreadsPerBlock)><<<static_cast<int>(helper.M()), GridDim::maxThreadsPerBlock, 0, stream>>>(matrix + helper.LeftOffsets()[batch],
row_sum + batch * helper.M(),
offset,
static_cast<int>(helper.K()));
@ -54,9 +54,9 @@ __global__ void ReduceColSumOnMatrixBKernel(const int8_t* matrix, int32_t* col_s
}
}
Status ReduceColSumOnMatrixB(const int8_t* matrix, int32_t* col_sum, const int8_t offset, const MatMulComputeHelper& helper) {
Status ReduceColSumOnMatrixB(cudaStream_t stream, const int8_t* matrix, int32_t* col_sum, const int8_t offset, const MatMulComputeHelper& helper) {
for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
ReduceColSumOnMatrixBKernel<static_cast<int>(GridDim::maxThreadsPerBlock)><<<static_cast<int>(helper.N()), GridDim::maxThreadsPerBlock, 0>>>(matrix + helper.RightOffsets()[batch],
ReduceColSumOnMatrixBKernel<static_cast<int>(GridDim::maxThreadsPerBlock)><<<static_cast<int>(helper.N()), GridDim::maxThreadsPerBlock, 0, stream>>>(matrix + helper.RightOffsets()[batch],
col_sum + batch * helper.N(),
offset,
static_cast<int32_t>(helper.K()),
@ -92,7 +92,8 @@ __global__ void ComputeOffsetOfMatrixB(const int32_t* row_sum,
}
}
Status OffsetOutput(const int32_t* row_sum,
Status OffsetOutput(cudaStream_t stream,
const int32_t* row_sum,
const int32_t* col_sum,
int32_t* output,
const int8_t a_offset,
@ -100,7 +101,7 @@ Status OffsetOutput(const int32_t* row_sum,
const MatMulComputeHelper& helper) {
if (a_offset && b_offset) {
for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
ComputeOffsetOfMatrixAB<<<static_cast<int>(helper.M()), GridDim::maxThreadsPerBlock, 0>>>(
ComputeOffsetOfMatrixAB<<<static_cast<int>(helper.M()), GridDim::maxThreadsPerBlock, 0, stream>>>(
row_sum + batch * helper.M(),
col_sum + batch * helper.N(),
output + helper.OutputOffsets()[batch],
@ -109,14 +110,14 @@ Status OffsetOutput(const int32_t* row_sum,
}
} else if (a_offset) {
for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
ComputeOffsetOfMatrixA<<<static_cast<int>(helper.M()), GridDim::maxThreadsPerBlock, 0>>>(
ComputeOffsetOfMatrixA<<<static_cast<int>(helper.M()), GridDim::maxThreadsPerBlock, 0, stream>>>(
col_sum + batch * helper.N(),
output + helper.OutputOffsets()[batch],
static_cast<int32_t>(helper.N()));
}
} else if (b_offset) {
for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
ComputeOffsetOfMatrixB<<<static_cast<int>(helper.M()), GridDim::maxThreadsPerBlock, 0>>>(
ComputeOffsetOfMatrixB<<<static_cast<int>(helper.M()), GridDim::maxThreadsPerBlock, 0, stream>>>(
row_sum + batch * helper.M(),
output + helper.OutputOffsets()[batch],
static_cast<int32_t>(helper.N()));

View file

@ -11,9 +11,10 @@
namespace onnxruntime {
namespace cuda {
Status ReduceRowSumOnMatrixA(const int8_t* matrix, int32_t* row_sum, const int8_t offset, const MatMulComputeHelper& helper);
Status ReduceColSumOnMatrixB(const int8_t* matrix, int32_t* col_sum, const int8_t offset, const MatMulComputeHelper& helper);
Status OffsetOutput(const int32_t* row_sum,
Status ReduceRowSumOnMatrixA(cudaStream_t stream, const int8_t* matrix, int32_t* row_sum, const int8_t offset, const MatMulComputeHelper& helper);
Status ReduceColSumOnMatrixB(cudaStream_t stream, const int8_t* matrix, int32_t* col_sum, const int8_t offset, const MatMulComputeHelper& helper);
Status OffsetOutput(cudaStream_t stream,
const int32_t* row_sum,
const int32_t* col_sum,
int32_t* output,
const int8_t a_offset,

View file

@ -13,6 +13,7 @@ namespace cuda {
template <typename T, bool is_log_softmax>
Status SoftMaxComputeHelper(
cudaStream_t stream,
const T* X,
const TensorShape& input_shape,
T* Y,
@ -28,7 +29,7 @@ Status SoftMaxComputeHelper(
// cudnnSoftmaxForward/Backward is not optimal implementation.
// TODO: remove cudnn path completely in the future.
if (D <= 1024 && D * sizeof(T) <= 4096) {
dispatch_softmax_forward<CudaT, CudaT, AccumulationType_t<CudaT>, is_log_softmax>(Y_data, X_data, gsl::narrow_cast<int>(D), gsl::narrow_cast<int>(D), gsl::narrow_cast<int>(N));
dispatch_softmax_forward<CudaT, CudaT, AccumulationType_t<CudaT>, is_log_softmax>(stream, Y_data, X_data, gsl::narrow_cast<int>(D), gsl::narrow_cast<int>(D), gsl::narrow_cast<int>(N));
return Status::OK();
}
@ -50,8 +51,8 @@ Status SoftMaxComputeHelper(
}
#define SPECIALIZED_SOFTMAX_HELPER_IMPL(T) \
template Status SoftMaxComputeHelper<T, false>(const T* input, const TensorShape& shape, T* Y, cudnnHandle_t handle, int64_t axis); \
template Status SoftMaxComputeHelper<T, true>(const T* input, const TensorShape& shape, T* Y, cudnnHandle_t handle, int64_t axis);
template Status SoftMaxComputeHelper<T, false>(cudaStream_t stream, const T* input, const TensorShape& shape, T* Y, cudnnHandle_t handle, int64_t axis); \
template Status SoftMaxComputeHelper<T, true>(cudaStream_t stream, const T* input, const TensorShape& shape, T* Y, cudnnHandle_t handle, int64_t axis);
SPECIALIZED_SOFTMAX_HELPER_IMPL(float)
SPECIALIZED_SOFTMAX_HELPER_IMPL(double)
@ -62,6 +63,7 @@ SPECIALIZED_SOFTMAX_HELPER_IMPL(MLFloat16)
#define SPECIALIZED_SOFTMAX_HELPER_IMPL_BFloat16(is_log_softmax) \
template <> \
Status SoftMaxComputeHelper<BFloat16, is_log_softmax>( \
cudaStream_t stream, \
const BFloat16* X, \
const TensorShape& input_shape, \
BFloat16* Y, \
@ -73,7 +75,7 @@ SPECIALIZED_SOFTMAX_HELPER_IMPL(MLFloat16)
auto Y_data = reinterpret_cast<CudaT*>(Y); \
auto X_data = reinterpret_cast<const CudaT*>(X); \
dispatch_softmax_forward<CudaT, CudaT, AccumulationType_t<CudaT>, is_log_softmax>( \
Y_data, X_data, gsl::narrow_cast<int>(D), gsl::narrow_cast<int>(D), gsl::narrow_cast<int>(N)); \
stream, Y_data, X_data, gsl::narrow_cast<int>(D), gsl::narrow_cast<int>(D), gsl::narrow_cast<int>(N)); \
return Status::OK(); \
}
@ -183,6 +185,7 @@ Status Softmax<T>::ComputeInternal(OpKernelContext* ctx) const {
// Perform the transpose
ORT_RETURN_IF_ERROR(Transpose::DoTranspose(cuda_ep_->GetDeviceProp(),
Stream(),
CublasHandle(),
permutation, *X, temp_input));
transposed_input = std::move(temp_input);
@ -208,11 +211,11 @@ Status Softmax<T>::ComputeInternal(OpKernelContext* ctx) const {
Status status;
if (log_softmax_) {
status = SoftMaxComputeHelper<T, true>(X_data, *compute_input_shape, Y_data, CudnnHandle(),
status = SoftMaxComputeHelper<T, true>(Stream(), X_data, *compute_input_shape, Y_data, CudnnHandle(),
is_transpose_required ? static_cast<int64_t>(rank) - 1
: static_cast<int64_t>(axis));
} else {
status = SoftMaxComputeHelper<T, false>(X_data, *compute_input_shape, Y_data, CudnnHandle(),
status = SoftMaxComputeHelper<T, false>(Stream(), X_data, *compute_input_shape, Y_data, CudnnHandle(),
is_transpose_required ? static_cast<int64_t>(rank) - 1
: static_cast<int64_t>(axis));
}
@ -227,6 +230,7 @@ Status Softmax<T>::ComputeInternal(OpKernelContext* ctx) const {
}
// Perform the transpose to get the axes back to the original ordering
ORT_RETURN_IF_ERROR(Transpose::DoTranspose(cuda_ep_->GetDeviceProp(),
Stream(),
CublasHandle(),
reverse_permutation, intermediate_output, *Y));
}

View file

@ -11,6 +11,7 @@ namespace cuda {
template <typename T, bool is_log_softmax>
Status SoftMaxComputeHelper(
cudaStream_t stream,
const T* input,
const TensorShape& shape,
T* Y,
@ -18,7 +19,7 @@ Status SoftMaxComputeHelper(
int64_t axis);
template <typename input_t, typename output_t, typename acc_t, bool is_log_softmax>
void dispatch_softmax_forward(output_t* dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count);
void dispatch_softmax_forward(cudaStream_t stream, output_t* dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count);
template <typename T>
class Softmax final : public CudaKernel {

View file

@ -135,7 +135,7 @@ __global__ void softmax_warp_forward(output_t* dst, const input_t* src, int batc
}
template <typename input_t, typename output_t, typename acc_t, bool is_log_softmax>
void dispatch_softmax_forward(output_t* dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count) {
void dispatch_softmax_forward(cudaStream_t stream, output_t* dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count) {
if (softmax_elements == 0) {
return;
} else {
@ -159,47 +159,47 @@ void dispatch_softmax_forward(output_t* dst, const input_t* src, int softmax_ele
switch (log2_elements) {
case 0: // 1
softmax_warp_forward<input_t, output_t, acc_t, 0, is_log_softmax>
<<<blocks, threads, 0>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
<<<blocks, threads, 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
break;
case 1: // 2
softmax_warp_forward<input_t, output_t, acc_t, 1, is_log_softmax>
<<<blocks, threads, 0>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
<<<blocks, threads, 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
break;
case 2: // 4
softmax_warp_forward<input_t, output_t, acc_t, 2, is_log_softmax>
<<<blocks, threads, 0>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
<<<blocks, threads, 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
break;
case 3: // 8
softmax_warp_forward<input_t, output_t, acc_t, 3, is_log_softmax>
<<<blocks, threads, 0>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
<<<blocks, threads, 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
break;
case 4: // 16
softmax_warp_forward<input_t, output_t, acc_t, 4, is_log_softmax>
<<<blocks, threads, 0>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
<<<blocks, threads, 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
break;
case 5: // 32
softmax_warp_forward<input_t, output_t, acc_t, 5, is_log_softmax>
<<<blocks, threads, 0>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
<<<blocks, threads, 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
break;
case 6: // 64
softmax_warp_forward<input_t, output_t, acc_t, 6, is_log_softmax>
<<<blocks, threads, 0>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
<<<blocks, threads, 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
break;
case 7: // 128
softmax_warp_forward<input_t, output_t, acc_t, 7, is_log_softmax>
<<<blocks, threads, 0>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
<<<blocks, threads, 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
break;
case 8: // 256
softmax_warp_forward<input_t, output_t, acc_t, 8, is_log_softmax>
<<<blocks, threads, 0>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
<<<blocks, threads, 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
break;
case 9: // 512
softmax_warp_forward<input_t, output_t, acc_t, 9, is_log_softmax>
<<<blocks, threads, 0>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
<<<blocks, threads, 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
break;
case 10: // 1024
softmax_warp_forward<input_t, output_t, acc_t, 10, is_log_softmax>
<<<blocks, threads, 0>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
<<<blocks, threads, 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
break;
default:
break;
@ -208,8 +208,8 @@ void dispatch_softmax_forward(output_t* dst, const input_t* src, int softmax_ele
}
#define SPECIALIZED_SOFTMAX_IMPL(input_t, output_t, acc_t) \
template void dispatch_softmax_forward<input_t, output_t, acc_t, false>(output_t * dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count); \
template void dispatch_softmax_forward<input_t, output_t, acc_t, true>(output_t * dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count);
template void dispatch_softmax_forward<input_t, output_t, acc_t, false>(cudaStream_t stream, output_t * dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count); \
template void dispatch_softmax_forward<input_t, output_t, acc_t, true>(cudaStream_t stream, output_t * dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count);
SPECIALIZED_SOFTMAX_IMPL(float, float, float)
SPECIALIZED_SOFTMAX_IMPL(half, half, float)

View file

@ -419,23 +419,24 @@ __global__ void ExcludeOutput(int64_t* output_i, int64_t K, int64_t dimension) {
template <typename T>
Status TopKImpl(const CudaKernel* kernel, const T* input_x, T* output_v, int64_t* output_i, const TArray<int64_t>& elem_nums, size_t size, int32_t axis, int64_t K, int64_t largest, int64_t sorted, int64_t N, int64_t dimension) {
typedef typename ToCudaType<T>::MappedType CudaT;
cudaStream_t stream = kernel->Stream();
const CudaT* input_x_ptr = reinterpret_cast<const CudaT*>(input_x);
CudaT* output_v_ptr = reinterpret_cast<CudaT*>(output_v);
auto aligned_K = ALIGN(K);
auto aligned_dimension = ALIGN(dimension);
if (aligned_dimension <= GridDim::maxThreadsPerBlock) {
BitonicTopK<CudaT><<<N, GridDim::maxThreadsPerBlock, aligned_dimension * sizeof(KV<CudaT>)>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, aligned_K, largest, sorted, dimension, aligned_dimension, NumericLimits<T>::Lowest(), NumericLimits<T>::Max());
BitonicTopK<CudaT><<<N, GridDim::maxThreadsPerBlock, aligned_dimension * sizeof(KV<CudaT>), stream>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, aligned_K, largest, sorted, dimension, aligned_dimension, NumericLimits<T>::Lowest(), NumericLimits<T>::Max());
} else if (K <= BT*16 || 0 == sorted) {
auto XPT = static_cast<int64_t>(ceil(static_cast<double>(dimension) / GridDim::maxThreadsPerBlock));
if (BT*2 >= K || 0 == sorted) {
RadixTopK<CudaT, BT, 2><<<N, BT, 256 * sizeof(uint32_t)>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits<T>::Lowest(), NumericLimits<T>::Max());
RadixTopK<CudaT, BT, 2><<<N, BT, 256 * sizeof(uint32_t), stream>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits<T>::Lowest(), NumericLimits<T>::Max());
} else if (BT*4>=K) {
RadixTopK<CudaT, BT, 4><<<N, BT, 256 * sizeof(uint32_t)>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits<T>::Lowest(), NumericLimits<T>::Max());
RadixTopK<CudaT, BT, 4><<<N, BT, 256 * sizeof(uint32_t), stream>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits<T>::Lowest(), NumericLimits<T>::Max());
} else if (BT*8>=K) {
RadixTopK<CudaT, BT, 8><<<N, BT, 256 * sizeof(uint32_t)>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits<T>::Lowest(), NumericLimits<T>::Max());
RadixTopK<CudaT, BT, 8><<<N, BT, 256 * sizeof(uint32_t), stream>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits<T>::Lowest(), NumericLimits<T>::Max());
} else {
RadixTopK<CudaT, BT, 16><<<N, BT, 256 * sizeof(uint32_t)>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits<T>::Lowest(), NumericLimits<T>::Max());
RadixTopK<CudaT, BT, 16><<<N, BT, 256 * sizeof(uint32_t), stream>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits<T>::Lowest(), NumericLimits<T>::Max());
}
} else {
auto input_key_buffer = kernel->GetScratchBuffer<CudaT>(dimension);
@ -447,21 +448,21 @@ Status TopKImpl(const CudaKernel* kernel, const T* input_x, T* output_v, int64_t
auto* input_value = input_value_buffer.get();
auto* output_value = output_value_buffer.get();
size_t temp_bytes = 0;
CUDA_RETURN_IF_ERROR(cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, input_key, output_key, input_value, output_value, dimension));
CUDA_RETURN_IF_ERROR(cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, input_key, output_key, input_value, output_value, dimension, 0, sizeof(T)*8, stream));
auto temp_storage_buffer = kernel->GetScratchBuffer<char>(temp_bytes);
auto* temp_storage = temp_storage_buffer.get();
auto blocks_per_grid_D = (int)(ceil(static_cast<float>(dimension) / BT));
auto blocks_per_grid_K = (int)(ceil(static_cast<float>(K) / BT));
for (int64_t i = 0; i < N; i++) {
FillInput<CudaT><<<blocks_per_grid_D, BT, 0>>>(input_x_ptr, input_key, input_value, elem_nums, size, axis, K, i, dimension);
CUDA_RETURN_IF_ERROR(1 == largest ? cub::DeviceRadixSort::SortPairsDescending(temp_storage, temp_bytes, input_key, output_key, input_value, output_value, dimension)
: cub::DeviceRadixSort::SortPairs(temp_storage, temp_bytes, input_key, output_key, input_value, output_value, dimension));
FillInput<CudaT><<<blocks_per_grid_D, BT, 0, stream>>>(input_x_ptr, input_key, input_value, elem_nums, size, axis, K, i, dimension);
CUDA_RETURN_IF_ERROR(1 == largest ? cub::DeviceRadixSort::SortPairsDescending(temp_storage, temp_bytes, input_key, output_key, input_value, output_value, dimension, 0, sizeof(T)*8, stream)
: cub::DeviceRadixSort::SortPairs(temp_storage, temp_bytes, input_key, output_key, input_value, output_value, dimension, 0, sizeof(T)*8, stream));
if (1 == sorted) {
FillOutput<CudaT><<<blocks_per_grid_K, BT, 0>>>(output_key, output_value, output_v_ptr, output_i, elem_nums, size, axis, K, i, dimension);
FillOutput<CudaT><<<blocks_per_grid_K, BT, 0, stream>>>(output_key, output_value, output_v_ptr, output_i, elem_nums, size, axis, K, i, dimension);
} else { //reorder by ascending index
ExcludeOutput<<<blocks_per_grid_D, BT, 0>>>(output_value, K, dimension);
CUDA_RETURN_IF_ERROR(cub::DeviceRadixSort::SortPairs(temp_storage, temp_bytes, output_value, input_value, output_key, input_key, dimension));
FillOutput<CudaT><<<blocks_per_grid_K, BT, 0>>>(input_key, input_value, output_v_ptr, output_i, elem_nums, size, axis, K, i, dimension);
ExcludeOutput<<<blocks_per_grid_D, BT, 0, stream>>>(output_value, K, dimension);
CUDA_RETURN_IF_ERROR(cub::DeviceRadixSort::SortPairs(temp_storage, temp_bytes, output_value, input_value, output_key, input_key, dimension, 0, sizeof(T)*8, stream));
FillOutput<CudaT><<<blocks_per_grid_K, BT, 0, stream>>>(input_key, input_value, output_v_ptr, output_i, elem_nums, size, axis, K, i, dimension);
}
}
}

View file

@ -50,6 +50,7 @@ Status UnaryElementwise::Prepare(OpKernelContext* context, UnaryElementwisePrepa
UnaryElementwisePreparation p; \
ORT_RETURN_IF_ERROR(UnaryElementwise::Prepare(context, &p)); \
Impl_##x( \
Stream(), \
reinterpret_cast<const typename ToCudaType<T>::MappedType*>(p.input_tensor->template Data<T>()), \
reinterpret_cast<typename ToCudaType<T>::MappedType*>(p.output_tensor->template MutableData<T>()), \
p.output_tensor->Shape().Size()); \

Some files were not shown because too many files have changed in this diff Show more