Support to allow user to specify compute stream per session (#3723)
* Support to allow user to specify compute stream per session

  Create the computation CUDA stream explicitly rather than using the default
  legacy stream or the per-thread default stream.

  - remove some redundant cudaStreamSynchronize calls
  - fix gpt2 model test failures
  - don't use the default stream in NCCL either
  - add stream synchronization in OnRunEnd()
  - use cub::DeviceScan::InclusiveSum, which can be called with a stream specified
  - fix TopK failure due to latest rebase
  - fix TensorRT support for a user-specified stream
  - add user_stream support in the TensorRT EP
  - use the same stream for both the TensorRT and CUDA EPs
  - fix ScatterND
  - specify the stream for Adasum and P2P kernels
  - fix Loop
  - fix CApiTest.custom_op_handler
  - fix CApiTest.varied_input_custom_op_handler
  - change for cudaMemcpyFromSymbol
  - improve provider options for a user-specified compute stream

* add changes for ROCM EP
* fix GatherGrad UT for ROCM EP
* clean code and fix NonMaxSuppression
* use default stream for ROCM for now
* fix CApiTest.custom_op_handler:OrtFormatCustomOpTests.ConvertOnnxModelToOrt
* fix tensorrt ut: CApiTest.io_binding_cuda

Co-authored-by: Weixing Zhang <wezhan@microsoft.com>
This commit is contained in: commit 299ace0759, parent 973c3917a6
320 changed files with 1876 additions and 1109 deletions
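With this change an application can create its own CUDA stream and hand it to a session through the new provider-option fields. A minimal usage sketch against the C++ API touched below (the model path is a placeholder, and the non-stream option values are just the struct's documented defaults — this is illustrative, not code from the commit):

  #include <cuda_runtime.h>
  #include <limits>
  #include "onnxruntime_cxx_api.h"

  int main() {
    cudaStream_t stream;
    cudaStreamCreate(&stream);  // application-owned compute stream

    OrtCUDAProviderOptions cuda_options{};
    cuda_options.device_id = 0;
    cuda_options.cuda_mem_limit = std::numeric_limits<size_t>::max();  // documented default
    cuda_options.arena_extend_strategy = 0;                            // kNextPowerOfTwo
    cuda_options.do_copy_in_default_stream = 1;
    cuda_options.has_user_compute_stream = 1;   // new field from this commit
    cuda_options.user_compute_stream = stream;  // new field from this commit

    Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "user_stream_demo");
    Ort::SessionOptions session_options;
    session_options.AppendExecutionProvider_CUDA(cuda_options);

    Ort::Session session(env, "model.onnx", session_options);  // placeholder model path
    // ... bind inputs/outputs and call session.Run(); the session now issues
    // kernels and async copies on `stream` rather than on the default stream ...

    cudaStreamSynchronize(stream);  // drain outstanding work before teardown
    cudaStreamDestroy(stream);
    return 0;
  }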
@@ -1198,9 +1198,9 @@ if (onnxruntime_USE_CUDA)
       endif()
     endif()
   endif()
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --default-stream legacy")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
   if (NOT WIN32)
-    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --expt-relaxed-constexpr --compiler-options -fPIC")
+    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --compiler-options -fPIC")
   endif()
   # Options passed to cudafe
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcudafe \"--diag_suppress=bad_friend_decl\"")
@@ -165,6 +165,9 @@ class IExecutionProvider {
    */
  virtual common::Status OnSessionInitializationEnd() { return Status::OK(); }

+ virtual common::Status SetComputeStream(void*) { return Status::OK(); }
+ virtual void* GetComputeStream() const { return nullptr; }
+
  void InsertAllocator(AllocatorPtr allocator);
  void ReplaceAllocator(AllocatorPtr allocator);
  // TODO: temporary solution, need to unify the interface in EP and AllocatorManager
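The two virtuals above give session-level code a uniform way to push a user stream into an execution provider. A hypothetical sketch of an EP honoring them (illustrative only — the actual CUDA/ROCm EP implementations in this commit differ):

  #include <cuda_runtime.h>
  #include "core/framework/execution_provider.h"

  namespace onnxruntime {

  // Hypothetical EP; the name and members are illustrative.
  class MyStreamAwareEP : public IExecutionProvider {
   public:
    MyStreamAwareEP() : IExecutionProvider("MyStreamAwareEP") {}

    common::Status SetComputeStream(void* stream) override {
      // Adopt the caller's stream; a null pointer means "keep using our own".
      if (stream != nullptr) {
        stream_ = static_cast<cudaStream_t>(stream);
      }
      return common::Status::OK();
    }

    void* GetComputeStream() const override { return stream_; }

   private:
    cudaStream_t stream_ = nullptr;  // consulted by kernel launches and async copies
  };

  }  // namespace onnxruntime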
@@ -266,8 +266,19 @@ typedef struct OrtCUDAProviderOptions {
   size_t cuda_mem_limit;          // default cuda memory limitation to maximum finite value of size_t.
   int arena_extend_strategy;      // default area extend strategy to KNextPowerOfTwo.
   int do_copy_in_default_stream;
+  int has_user_compute_stream;
+  void* user_compute_stream;
 } OrtCUDAProviderOptions;

+/// <summary>
+/// Options for the TensorRT provider that are passed to SessionOptionsAppendExecutionProvider_TensorRT
+/// </summary>
+typedef struct OrtTensorRTProviderOptions {
+  int device_id;
+  int has_user_compute_stream;
+  void* user_compute_stream;
+} OrtTensorRTProviderOptions;
+
 /// <summary>
 /// Options for the OpenVINO provider that are passed to SessionOptionsAppendExecutionProvider_OpenVINO
 /// </summary>
@@ -1146,6 +1157,12 @@ struct OrtApi {
    */
  ORT_API2_STATUS(ModelMetadataGetGraphDescription, _In_ const OrtModelMetadata* model_metadata,
                  _Inout_ OrtAllocator* allocator, _Outptr_ char** value);
+  /**
+   * Append TensorRT execution provider to the session options
+   * If TensorRT is not available (due to a non TensorRT enabled build), this function will return failure.
+   */
+  ORT_API2_STATUS(SessionOptionsAppendExecutionProvider_TensorRT,
+                  _In_ OrtSessionOptions* options, _In_ const OrtTensorRTProviderOptions* tensorrt_options);
 };

 /*
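For callers on the plain C API, the new OrtApi entry can be reached through the API table. A small sketch with a user-supplied stream (error handling left to the caller; a TensorRT-enabled build is assumed):

  #include <cuda_runtime.h>
  #include "onnxruntime_c_api.h"

  OrtStatus* AppendTensorrtWithUserStream(OrtSessionOptions* session_options, cudaStream_t stream) {
    const OrtApi* api = OrtGetApiBase()->GetApi(ORT_API_VERSION);

    OrtTensorRTProviderOptions trt_options;
    trt_options.device_id = 0;
    trt_options.has_user_compute_stream = 1;  /* tell the EP to adopt `stream` */
    trt_options.user_compute_stream = stream;

    /* Returns a non-NULL status on builds without TensorRT. */
    return api->SessionOptionsAppendExecutionProvider_TensorRT(session_options, &trt_options);
  }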
@@ -326,6 +326,7 @@ struct SessionOptions : Base<OrtSessionOptions> {

  SessionOptions& AppendExecutionProvider_CUDA(const OrtCUDAProviderOptions& provider_options);
  SessionOptions& AppendExecutionProvider_OpenVINO(const OrtOpenVINOProviderOptions& provider_options);
+ SessionOptions& AppendExecutionProvider_TensorRT(const OrtTensorRTProviderOptions& provider_options);
 };

 struct ModelMetadata : Base<OrtModelMetadata> {
@@ -490,6 +490,11 @@ inline SessionOptions& SessionOptions::AppendExecutionProvider_CUDA(const OrtCUD
   return *this;
 }

+inline SessionOptions& SessionOptions::AppendExecutionProvider_TensorRT(const OrtTensorRTProviderOptions& provider_options) {
+  ThrowOnError(GetApi().SessionOptionsAppendExecutionProvider_TensorRT(p_, &provider_options));
+  return *this;
+}
+
 inline SessionOptions& SessionOptions::AppendExecutionProvider_OpenVINO(const OrtOpenVINOProviderOptions& provider_options) {
   ThrowOnError(GetApi().SessionOptionsAppendExecutionProvider_OpenVINO(p_, &provider_options));
   return *this;
@@ -29,6 +29,7 @@ namespace cuda {
     ORT_RETURN_IF_ERROR(UnaryElementwise::Prepare(context, &p)); \
     Ctx##x func_ctx = MakeFuncCtx(); \
     Impl_##x<typename ToCudaType<T>::MappedType>( \
+        Stream(), \
         reinterpret_cast<const typename ToCudaType<T>::MappedType*>(p.input_tensor->template Data<T>()), \
         reinterpret_cast<typename ToCudaType<T>::MappedType*>(p.output_tensor->template MutableData<T>()), \
         &func_ctx, p.output_tensor->Shape().Size()); \
@@ -45,14 +45,15 @@ struct OP_Gelu : public CtxGelu {

 #define UNARY_ACTIVATION_IMPL(name) \
   UNARY_ACTIVATION_IMPL_DECLARATION(name) { \
-    UnaryElementWiseImpl(input_data, \
+    UnaryElementWiseImpl(stream, \
+                         input_data, \
                          output_data, \
                          *reinterpret_cast<const OP_##name<T>*>(func_ctx), \
                          count); \
   }

 #define SPECIALIZED_UNARY_ACTIVATION_IMPL(name, T) \
-  template void Impl_##name<T>(const T* input_data, T* output_data, const Ctx##name* func_ctx, size_t count);
+  template void Impl_##name<T>(cudaStream_t stream, const T* input_data, T* output_data, const Ctx##name* func_ctx, size_t count);

 #define SPECIALIZED_UNARY_ACTIVATIONL_HFD(name) \
   SPECIALIZED_UNARY_ACTIVATION_IMPL(name, half) \
@@ -22,6 +22,7 @@ typedef onnxruntime::cuda::CtxNull CtxGelu;
 #define UNARY_ACTIVATION_IMPL_DECLARATION(name) \
   template <typename T> \
   void Impl_##name( \
+      cudaStream_t stream, \
       const T* input_data, \
       T* output_data, \
       const Ctx##name* func_ctx, \
@@ -88,6 +88,7 @@ Status Attention<T>::ComputeInternal(OpKernelContext* context) const {
   auto temp_buffer = GetScratchBuffer<void>(workSpaceSize);
   if (!LaunchAttentionKernel(
           device_prop,
+          Stream(),
           reinterpret_cast<const CudaT*>(gemm_buffer.get()),
           nullptr == mask_index ? nullptr : mask_index->template Data<int>(),
           nullptr == mask_index ? nullptr : &(mask_index->Shape().GetDims()),
@@ -148,6 +148,7 @@ bool QkvToContext(

 bool LaunchAttentionKernel(
     const cudaDeviceProp& prop,
+    cudaStream_t stream,
     const void* input,
     const int* mask_index,
     const std::vector<int64_t>* mask_index_dims,
@@ -163,9 +164,6 @@ bool LaunchAttentionKernel(
     int past_sequence_length,
     const void* past,
     void* present) {
-  // use default stream
-  const cudaStream_t stream = nullptr;
-
   if (element_size == 2) {
     return QkvToContext(prop, cublas, stream,
                         batch_size, sequence_length, num_heads, head_size, element_size,
@@ -20,6 +20,7 @@ size_t GetAttentionWorkspaceSize(

 bool LaunchAttentionKernel(
     const cudaDeviceProp& prop,                   // Device Properties
+    cudaStream_t stream,                          // cuda stream
     const void* input,                            // Input tensor
     const int* mask_index,                        // Attention mask raw data or index (end position of each sequence, or end positions and start positions). NULL means no mask.
     const std::vector<int64_t>* mask_index_dims,  // Mask index shape
@@ -61,6 +61,7 @@ Status EmbedLayerNorm<T>::ComputeInternal(OpKernelContext* context) const {
   size_t element_size = sizeof(T);

   if (!LaunchEmbedLayerNormKernel(
+          Stream(),
           output->template MutableData<T>(),
           mask_index->template MutableData<int32_t>(),
           input_ids->template Data<int32_t>(),
@@ -173,6 +173,7 @@ bool EmbedSkipLayerNorm(
 }

 bool LaunchEmbedLayerNormKernel(
+    cudaStream_t stream,
     void* output,
     void* mask_index,
     const int* input_ids,
@@ -188,10 +189,8 @@ bool LaunchEmbedLayerNormKernel(
     int batch_size,
     int sequence_length,
     const size_t element_size) {
-  const cudaStream_t stream = nullptr;  // default stream
-
   if (nullptr == input_mask) {
-    if (!CUDA_CALL(cudaMemsetAsync(mask_index, 0, sizeof(int) * batch_size)))
+    if (!CUDA_CALL(cudaMemsetAsync(mask_index, 0, sizeof(int) * batch_size, stream)))
       return false;
   } else if (!ComputeMaskIndex(stream, sequence_length, batch_size, input_mask, static_cast<int*>(mask_index))) {
     return false;
@@ -6,7 +6,8 @@ namespace onnxruntime {
 namespace contrib {
 namespace cuda {

-bool LaunchEmbedLayerNormKernel(void* output,  // output tensor
+bool LaunchEmbedLayerNormKernel(cudaStream_t stream,
+                                void* output,            // output tensor
                                 void* mask_index,        // output mask index
                                 const int* input_ids,    // input word IDs
                                 const int* segment_ids,  // input segment IDs
@@ -47,7 +47,7 @@ Status FastGelu<T>::ComputeInternal(OpKernelContext* context) const {
   int64_t bias_length = (nullptr == bias) ? 0 : bias->Shape().Size();
   typedef typename ToCudaType<T>::MappedType CudaT;
   if (!LaunchFastGeluKernel<CudaT>(GetDeviceProp(),
-                                   nullptr,
+                                   Stream(),
                                    static_cast<int>(input_length),
                                    static_cast<int>(bias_length),
                                    reinterpret_cast<const CudaT*>(input->template Data<T>()),
@@ -111,6 +111,7 @@ Status LongformerAttention<T>::ComputeInternal(OpKernelContext* context) const {
   auto workspace_buffer = GetScratchBuffer<void>(workSpaceSize);
   if (!LaunchLongformerAttentionKernel(
           device_prop,
+          Stream(),
           reinterpret_cast<const CudaT*>(gemm_buffer.get()),
           reinterpret_cast<const CudaT*>(mask->template Data<T>()),
           reinterpret_cast<const CudaT*>(global_gemm_buffer.get()),
@@ -814,6 +814,7 @@ bool LongformerQkvToContext(

 bool LaunchLongformerAttentionKernel(
     const cudaDeviceProp& prop,
+    cudaStream_t stream,
     const void* input,
     const void* attention_mask,
     const void* global_input,
@@ -828,9 +829,6 @@ bool LaunchLongformerAttentionKernel(
     void* workspace,
     cublasHandle_t& cublas,
     const size_t element_size) {
-  // use default stream
-  const cudaStream_t stream = nullptr;
-
   if (element_size == 2) {
     return LongformerQkvToContext(prop, cublas, stream,
                                   batch_size, sequence_length, num_heads, head_size, window, element_size,
@@ -18,6 +18,7 @@ size_t GetLongformerAttentionWorkspaceSize(

 bool LaunchLongformerAttentionKernel(
     const cudaDeviceProp& device_prop,  // Device Properties
+    cudaStream_t stream,                // CUDA stream
     const void* input,                  // Input tensor
     const void* attention_mask,         // Attention mask with shape (B, S)
     const void* global_input,           // Global attention input, or nullptr when max_num_global == 0.
@@ -93,6 +93,7 @@ Status SkipLayerNorm<T>::ComputeInternal(OpKernelContext* ctx) const {
   size_t element_size = sizeof(T);

   if (!LaunchSkipLayerNormKernel(
+          Stream(),
           output->template MutableData<T>(),
           input->template Data<T>(),
           skip->template Data<T>(),
@@ -100,6 +100,7 @@ bool ComputeSkipLayerNorm(
 }

 bool LaunchSkipLayerNormKernel(
+    cudaStream_t stream,
     void* output,
     const void* input,
     const void* skip,
@@ -110,9 +111,6 @@ bool LaunchSkipLayerNormKernel(
     int hidden_size,
     int element_count,
     size_t element_size) {
-  // use default stream
-  const cudaStream_t stream = nullptr;
-
   if (element_size == 2) {
     return ComputeSkipLayerNorm(
         stream,
@@ -8,6 +8,7 @@ namespace contrib {
 namespace cuda {

 bool LaunchSkipLayerNormKernel(
+    cudaStream_t stream,
     void* output,       // output tensor
     const void* input,  // input tensor
     const void* skip,   // skip tensor
@@ -90,7 +90,7 @@ class FusedConv : public onnxruntime::cuda::Conv<T> {
                                 Base::s_.y_data, beta, Base::s_.y_tensor, Base::s_.y_data));
     }
     if (Base::s_.post_slicing_required) {
-      onnxruntime::cuda::SliceOutUnwantedOutputSection(Base::s_.y_data, Base::s_.y_dims_with_adjusted_pads, Base::s_.Y->MutableDataRaw(),
+      onnxruntime::cuda::SliceOutUnwantedOutputSection(this->Stream(), Base::s_.y_data, Base::s_.y_dims_with_adjusted_pads, Base::s_.Y->MutableDataRaw(),
                                                        Base::s_.y_dims, Base::s_.slice_starts, Base::s_.slice_ends, Base::s_.slice_axes, Base::s_.element_size);
     }
     return Status::OK();
@@ -35,22 +35,24 @@ ONNX_OPERATOR_KERNEL_EX(
 namespace inverse_internal {

 template <typename T>
-Status ComputeMatrixOffsets(T* workspace_data, size_t num_batches, size_t rows, IAllocatorUniquePtr<T*>& matrix_ptrs) {
+Status ComputeMatrixOffsets(cudaStream_t stream, T* workspace_data, size_t num_batches, size_t rows, IAllocatorUniquePtr<T*>& matrix_ptrs) {
   std::vector<T*> cuda_ptrs;
   const size_t matrix_size = rows * rows;
   for (size_t i = 0; i < num_batches; ++i) {
     cuda_ptrs.push_back(workspace_data);
     workspace_data += matrix_size;
   }
-  CUDA_RETURN_IF_ERROR(cudaMemcpy(matrix_ptrs.get(), cuda_ptrs.data(), sizeof(T*) * num_batches,
-                                  cudaMemcpyHostToDevice));
+
+  CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(matrix_ptrs.get(), cuda_ptrs.data(), sizeof(T*) * num_batches,
+                                       cudaMemcpyHostToDevice, stream));
   return Status::OK();
 }

-Status CheckForSingularity(const IAllocatorUniquePtr<int>& info, const std::unique_ptr<int[]>& info_cpu, size_t num_batches) {
+Status CheckForSingularity(cudaStream_t stream, const IAllocatorUniquePtr<int>& info, const std::unique_ptr<int[]>& info_cpu, size_t num_batches) {
   // Let's check if any of the info values is non-zero
-  CUDA_RETURN_IF_ERROR(cudaMemcpy(info_cpu.get(), info.get(), sizeof(int) * num_batches,
-                                  cudaMemcpyDeviceToHost));
+  // cudaMemcpyAsync from device memory to pageable host memory will return only once the copy has completed.
+  CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(info_cpu.get(), info.get(), sizeof(int) * num_batches,
+                                       cudaMemcpyDeviceToHost, stream));
   for (size_t i = 0; i < num_batches; ++i) {
     if (info_cpu[i] != 0) {
       return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Matrix is singular at batch:", i);
@@ -63,7 +65,7 @@ Status CheckForSingularity(const IAllocatorUniquePtr<int>& info, const std::uniq

 template <typename T>
 struct Inverse::ComputeImpl {
-  Status operator()(Inverse::CublasHandle cublas_h, const Inverse* inst, const Tensor& input, Tensor& output,
+  Status operator()(cudaStream_t stream, Inverse::CublasHandle cublas_h, const Inverse* inst, const Tensor& input, Tensor& output,
                     const IAllocatorUniquePtr<int>& info, const IAllocatorUniquePtr<int>& pivots,
                     size_t num_batches, size_t rows) const {
     using namespace onnxruntime::cuda;
@@ -79,52 +81,52 @@ struct Inverse::ComputeImpl {
       IAllocatorUniquePtr<float> input_workspace = inst->GetScratchBuffer<float>(input_count);
       if (std::is_same<T, MLFloat16>::value) {
         // Convert from MLFloat16(half) to float
-        Impl_Cast<CudaT, float>(reinterpret_cast<const CudaT*>(input.Data<MLFloat16>()), input_workspace.get(), input_count);
+        Impl_Cast<CudaT, float>(stream, reinterpret_cast<const CudaT*>(input.Data<MLFloat16>()), input_workspace.get(), input_count);
       } else {
-        CUDA_RETURN_IF_ERROR(cudaMemcpy(input_workspace.get(), input.Data<float>(), sizeof(float) * input_count,
-                                        cudaMemcpyDeviceToDevice));
+        CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(input_workspace.get(), input.Data<float>(), sizeof(float) * input_count,
+                                             cudaMemcpyDeviceToDevice, stream));
       }
       IAllocatorUniquePtr<float*> matrix_ptrs = inst->GetScratchBuffer<float*>(n_batches);
-      ORT_RETURN_IF_ERROR(ComputeMatrixOffsets<float>(input_workspace.get(), num_batches, rows, matrix_ptrs));
+      ORT_RETURN_IF_ERROR(ComputeMatrixOffsets<float>(stream, input_workspace.get(), num_batches, rows, matrix_ptrs));
       // Do LU factorization
       CUBLAS_RETURN_IF_ERROR(cublasSgetrfBatched(cublas_h, dim, matrix_ptrs.get(), dim, pivots.get(), info.get(), n_batches));
-      ORT_RETURN_IF_ERROR(CheckForSingularity(info, info_cpu, num_batches));
+      ORT_RETURN_IF_ERROR(CheckForSingularity(stream, info, info_cpu, num_batches));

       // Need to compute ptrs for output buffers
       // Output for MLFloat
       IAllocatorUniquePtr<float*> output_ptrs = inst->GetScratchBuffer<float*>(n_batches);
       if (std::is_same<T, MLFloat16>::value) {
         IAllocatorUniquePtr<float> ml_float_output = inst->GetScratchBuffer<float>(input_count);
-        ORT_RETURN_IF_ERROR(ComputeMatrixOffsets<float>(ml_float_output.get(), num_batches, rows, output_ptrs));
+        ORT_RETURN_IF_ERROR(ComputeMatrixOffsets<float>(stream, ml_float_output.get(), num_batches, rows, output_ptrs));
         // Do the inverse
         CUBLAS_RETURN_IF_ERROR(cublasSgetriBatched(cublas_h, dim, matrix_ptrs.get(), dim, pivots.get(), output_ptrs.get(), dim, info.get(), n_batches));
-        ORT_RETURN_IF_ERROR(CheckForSingularity(info, info_cpu, num_batches));
+        ORT_RETURN_IF_ERROR(CheckForSingularity(stream, info, info_cpu, num_batches));
         // Copy the result to output with casting
-        Impl_Cast<float, CudaT>(ml_float_output.get(), reinterpret_cast<CudaT*>(output.MutableData<MLFloat16>()), input_count);
+        Impl_Cast<float, CudaT>(stream, ml_float_output.get(), reinterpret_cast<CudaT*>(output.MutableData<MLFloat16>()), input_count);
         // We are done here
       } else {
-        ORT_RETURN_IF_ERROR(ComputeMatrixOffsets<float>(output.MutableData<float>(), num_batches, rows, output_ptrs));
+        ORT_RETURN_IF_ERROR(ComputeMatrixOffsets<float>(stream, output.MutableData<float>(), num_batches, rows, output_ptrs));
        // Do the inverse
        CUBLAS_RETURN_IF_ERROR(cublasSgetriBatched(cublas_h, dim, matrix_ptrs.get(), dim, pivots.get(), output_ptrs.get(), dim, info.get(), n_batches));
-        ORT_RETURN_IF_ERROR(CheckForSingularity(info, info_cpu, num_batches));
+        ORT_RETURN_IF_ERROR(CheckForSingularity(stream, info, info_cpu, num_batches));
         // We are done here
       }
     } else if (std::is_same<T, double>::value) {
       IAllocatorUniquePtr<double> input_workspace = inst->GetScratchBuffer<double>(static_cast<int>(input_count));
-      CUDA_RETURN_IF_ERROR(cudaMemcpy(input_workspace.get(), input.Data<double>(), sizeof(double) * input_count,
-                                      cudaMemcpyDeviceToDevice));
+      CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(input_workspace.get(), input.Data<double>(), sizeof(double) * input_count,
+                                           cudaMemcpyDeviceToDevice, stream));

       IAllocatorUniquePtr<double*> matrix_ptrs = inst->GetScratchBuffer<double*>(n_batches);
-      ORT_RETURN_IF_ERROR(ComputeMatrixOffsets<double>(input_workspace.get(), num_batches, rows, matrix_ptrs));
+      ORT_RETURN_IF_ERROR(ComputeMatrixOffsets<double>(stream, input_workspace.get(), num_batches, rows, matrix_ptrs));
       // Do LU factorization
       CUBLAS_RETURN_IF_ERROR(cublasDgetrfBatched(cublas_h, dim, matrix_ptrs.get(), dim, pivots.get(), info.get(), n_batches));
-      ORT_RETURN_IF_ERROR(CheckForSingularity(info, info_cpu, num_batches));
+      ORT_RETURN_IF_ERROR(CheckForSingularity(stream, info, info_cpu, num_batches));

       // Need to compute ptrs for output buffers
       IAllocatorUniquePtr<double*> output_ptrs = inst->GetScratchBuffer<double*>(n_batches);
-      ORT_RETURN_IF_ERROR(ComputeMatrixOffsets<double>(output.MutableData<double>(), num_batches, rows, output_ptrs));
+      ORT_RETURN_IF_ERROR(ComputeMatrixOffsets<double>(stream, output.MutableData<double>(), num_batches, rows, output_ptrs));
       CUBLAS_RETURN_IF_ERROR(cublasDgetriBatched(cublas_h, dim, matrix_ptrs.get(), dim, pivots.get(), output_ptrs.get(), dim, info.get(), n_batches));
-      ORT_RETURN_IF_ERROR(CheckForSingularity(info, info_cpu, num_batches));
+      ORT_RETURN_IF_ERROR(CheckForSingularity(stream, info, info_cpu, num_batches));
       // We are done here
     } else {
       ORT_THROW("Type is not supported");
@@ -148,11 +150,11 @@ Status Inverse::ComputeInternal(OpKernelContext* ctx) const {
   }

   IAllocatorUniquePtr<int> info = GetScratchBuffer<int>(num_batches);
-  CUDA_RETURN_IF_ERROR(cudaMemsetAsync(info.get(), 0, num_batches));
+  CUDA_RETURN_IF_ERROR(cudaMemsetAsync(info.get(), 0, num_batches, Stream()));
   IAllocatorUniquePtr<int> pivots = GetScratchBuffer<int>(rows * num_batches);

   utils::MLTypeCallDispatcherRet<Status, ComputeImpl, float, double, MLFloat16> t_disp(input->GetElementType());
-  return t_disp.Invoke(Base::CublasHandle(), this, *input, *output, info, pivots, num_batches, rows);
+  return t_disp.Invoke(Stream(), Base::CublasHandle(), this, *input, *output, info, pivots, num_batches, rows);
 }

 }  // namespace cuda
@@ -98,7 +98,7 @@ Status LayerNorm<T, U, simplified>::ComputeInternal(OpKernelContext* ctx) const
     inv_var_data = reinterpret_cast<CudaU*>(var->template MutableData<U>());
   }

-  HostApplyLayerNorm<CudaT, CudaU, simplified>(GetDeviceProp(), Y_data, mean_data, inv_var_data, X_data, n1, n2, epsilon_, scale_data, bias_data);
+  HostApplyLayerNorm<CudaT, CudaU, simplified>(GetDeviceProp(), Stream(), Y_data, mean_data, inv_var_data, X_data, n1, n2, epsilon_, scale_data, bias_data);
   return Status::OK();
 }
@@ -350,6 +350,7 @@ __global__ void cuApplyLayerNorm(
 template <typename T, typename U, bool simplified>
 void HostApplyLayerNorm(
     const cudaDeviceProp& prop,
+    cudaStream_t stream,
     T* output,
     U* mean,
     U* invvar,
@@ -367,7 +368,7 @@ void HostApplyLayerNorm(
   const dim3 blocks(1, std::min<unsigned int>(n1, maxGridY), 1);
   int nshared =
       threads.y > 1 ? threads.y * sizeof(U) + (threads.y / 2) * sizeof(U) : 0;
-  cuApplyLayerNorm<T, U, simplified><<<blocks, threads, nshared, 0>>>(
+  cuApplyLayerNorm<T, U, simplified><<<blocks, threads, nshared, stream>>>(
       output,
       mean,
       invvar,
@@ -378,7 +379,7 @@ void HostApplyLayerNorm(
 }

 #define LAYERNORM_LINEAR_IMPL(T, U, simplified) \
-  template void HostApplyLayerNorm<T, U, simplified>(const cudaDeviceProp& prop, T* output, U* mean, U* invvar, const T* input, int n1, int n2, \
+  template void HostApplyLayerNorm<T, U, simplified>(const cudaDeviceProp& prop, cudaStream_t stream, T* output, U* mean, U* invvar, const T* input, int n1, int n2, \
                                                      double epsilon, const T* gamma, const T* beta);

 LAYERNORM_LINEAR_IMPL(float, float, true)
@@ -32,6 +32,7 @@ namespace cuda {
 template <typename T, typename U, bool simplified>
 void HostApplyLayerNorm(
     const cudaDeviceProp& prop,
+    cudaStream_t stream,
     T* output,
     U* mean,
     U* invvar,
@@ -15,6 +15,7 @@ namespace cuda {

 template <typename T>
 void DispatchBiasSoftmaxForwardImpl(
+    cudaStream_t stream,
     Tensor* output_tensor,
     const Tensor* input_tensor,
     const Tensor* input_bias_tensor,
@@ -25,6 +26,7 @@ void DispatchBiasSoftmaxForwardImpl(

 template <typename T>
 void DispatchBiasSoftMaxForwardViaDnnLibraryImpl(
+    cudaStream_t stream,
     cudnnHandle_t cudaDnnHandle,
     int element_count,
     int batch_count,
@@ -64,12 +66,12 @@ Status BiasSoftmax::ComputeInternal(OpKernelContext* ctx) const {
     // expect thread blocks can fill SM at high occupancy without overflowing registers
     utils::MLTypeCallDispatcher<DispatchBiasSoftmaxForward, double, float, MLFloat16>
         t_disp(X->GetElementType());
-    t_disp.Invoke(Y, X, B, D, N, D, broadcast_size);
+    t_disp.Invoke(Stream(), Y, X, B, D, N, D, broadcast_size);
   } else {
     // need to fallback to add kernel + CUDA DNN library softmax call :/
     utils::MLTypeCallDispatcher<DispatchBiasSoftMaxForwardViaDnnLibrary, double, float, MLFloat16>
         t_disp(X->GetElementType());
-    t_disp.Invoke(CudnnHandle(), D, N, broadcast_axis, softmax_axis, X_shape, X, B_shape, B, Y);
+    t_disp.Invoke(Stream(), CudnnHandle(), D, N, broadcast_axis, softmax_axis, X_shape, X, B_shape, B, Y);
   }

   return Status::OK();
@@ -77,6 +79,7 @@ Status BiasSoftmax::ComputeInternal(OpKernelContext* ctx) const {

 template <typename T>
 void DispatchBiasSoftmaxForward<T>::operator()(
+    cudaStream_t stream,
     Tensor* output,
     const Tensor* input,
     const Tensor* input_bias,
@@ -85,6 +88,7 @@ void DispatchBiasSoftmaxForward<T>::operator()(
     int batch_stride,
     int bias_broadcast_size_per_batch) {
   DispatchBiasSoftmaxForwardImpl<T>(
+      stream,
       output,
       input,
       input_bias,
@@ -96,6 +100,7 @@ void DispatchBiasSoftmaxForward<T>::operator()(

 template <typename T>
 void DispatchBiasSoftMaxForwardViaDnnLibrary<T>::operator()(
+    cudaStream_t stream,
     cudnnHandle_t cudaDnnHandle,
     int element_count,
     int batch_count,
@@ -107,6 +112,7 @@ void DispatchBiasSoftMaxForwardViaDnnLibrary<T>::operator()(
     const onnxruntime::Tensor* B,
     onnxruntime::Tensor* Y) {
   DispatchBiasSoftMaxForwardViaDnnLibraryImpl<T>(
+      stream,
       cudaDnnHandle,
       element_count,
       batch_count,
@@ -13,6 +13,7 @@ namespace cuda {
 template <typename T>
 struct DispatchBiasSoftmaxForward {
   void operator()(
+      cudaStream_t stream,
       Tensor* output,
       const Tensor* input,
       const Tensor* input_bias,
@@ -25,6 +26,7 @@ struct DispatchBiasSoftmaxForward {
 template <typename T>
 struct DispatchBiasSoftMaxForwardViaDnnLibrary {
   void operator()(
+      cudaStream_t stream,
       cudnnHandle_t cudaDnnHandle,
       int element_count,
       int batch_count,
@@ -127,6 +127,7 @@ __global__ void BiasSoftmaxWarpForward(

 template <typename T>
 void DispatchBiasSoftmaxForwardImpl(
+    cudaStream_t stream,
     Tensor* output_tensor,
     const Tensor* input_tensor,
     const Tensor* input_bias_tensor,
@@ -167,47 +168,47 @@ void DispatchBiasSoftmaxForwardImpl(
   switch (log2_elements) {
     case 0:  // 1
       BiasSoftmaxWarpForward<input_t, output_t, acc_t, 0>
-          <<<blocks, threads, 0>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
+          <<<blocks, threads, 0, stream>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
       break;
     case 1:  // 2
       BiasSoftmaxWarpForward<input_t, output_t, acc_t, 1>
-          <<<blocks, threads, 0>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
+          <<<blocks, threads, 0, stream>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
       break;
     case 2:  // 4
       BiasSoftmaxWarpForward<input_t, output_t, acc_t, 2>
-          <<<blocks, threads, 0>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
+          <<<blocks, threads, 0, stream>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
       break;
     case 3:  // 8
       BiasSoftmaxWarpForward<input_t, output_t, acc_t, 3>
-          <<<blocks, threads, 0>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
+          <<<blocks, threads, 0, stream>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
       break;
     case 4:  // 16
       BiasSoftmaxWarpForward<input_t, output_t, acc_t, 4>
-          <<<blocks, threads, 0>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
+          <<<blocks, threads, 0, stream>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
       break;
     case 5:  // 32
       BiasSoftmaxWarpForward<input_t, output_t, acc_t, 5>
-          <<<blocks, threads, 0>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
+          <<<blocks, threads, 0, stream>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
       break;
     case 6:  // 64
       BiasSoftmaxWarpForward<input_t, output_t, acc_t, 6>
-          <<<blocks, threads, 0>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
+          <<<blocks, threads, 0, stream>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
       break;
     case 7:  // 128
       BiasSoftmaxWarpForward<input_t, output_t, acc_t, 7>
-          <<<blocks, threads, 0>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
+          <<<blocks, threads, 0, stream>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
       break;
     case 8:  // 256
       BiasSoftmaxWarpForward<input_t, output_t, acc_t, 8>
-          <<<blocks, threads, 0>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
+          <<<blocks, threads, 0, stream>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
       break;
     case 9:  // 512
       BiasSoftmaxWarpForward<input_t, output_t, acc_t, 9>
-          <<<blocks, threads, 0>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
+          <<<blocks, threads, 0, stream>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
       break;
     case 10:  // 1024
       BiasSoftmaxWarpForward<input_t, output_t, acc_t, 10>
-          <<<blocks, threads, 0>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
+          <<<blocks, threads, 0, stream>>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
       break;
     default:
       break;
@@ -216,6 +217,7 @@ void DispatchBiasSoftmaxForwardImpl(

 #define SPECIALIZED_BIAS_SOFTMAX_IMPL(T) \
   template void DispatchBiasSoftmaxForwardImpl<T>( \
+      cudaStream_t stream, \
       Tensor * output_tensor, \
       const Tensor* input_tensor, \
       const Tensor* input_bias_tensor, \
@@ -232,6 +234,7 @@ SPECIALIZED_BIAS_SOFTMAX_IMPL(MLFloat16)
 // note: This is an unhappy path! There is no performance benefit for the fusion.
 template <typename T>
 void DispatchBiasSoftMaxForwardViaDnnLibraryImpl(
+    cudaStream_t stream,
     cudnnHandle_t cudaDnnHandle,
     int element_count,
     int batch_count,
@@ -278,6 +281,7 @@ void DispatchBiasSoftMaxForwardViaDnnLibraryImpl(

   // invoke elementwise add with broadcast kernel
   ::onnxruntime::cuda::BinaryElementWiseImpl(
+      stream,
       (int32_t)X_shape.NumDimensions(),
       &lhs_padded_strides,
       X_data,
@@ -311,6 +315,7 @@ void DispatchBiasSoftMaxForwardViaDnnLibraryImpl(

 #define SPECIALIZED_BIAS_SOFTMAX_IMPL_VIA_DNN(T) \
   template void DispatchBiasSoftMaxForwardViaDnnLibraryImpl<T>( \
+      cudaStream_t stream, \
       cudnnHandle_t cudaDnnHandle, \
       int element_count, \
       int batch_count, \
@@ -25,6 +25,7 @@ namespace cuda {
     BinaryElementwisePreparation prepare; \
     ORT_RETURN_IF_ERROR(Prepare(context, &prepare)); \
     Impl_##x<typename ToCudaType<T>::MappedType>( \
+        Stream(), \
         prepare.output_rank_or_simple_broadcast, \
         &prepare.lhs_padded_strides, \
         reinterpret_cast<const typename ToCudaType<T>::MappedType*>(prepare.lhs_tensor->template Data<T>()), \
@@ -20,7 +20,8 @@ namespace cuda {

 #define CONTRIB_BINARY_ELEMENTWISE_IMPL(name) \
   CONTRIB_BINARY_ELEMENTWISE_IMPL_DECLARATION(name) { \
-    BinaryElementWiseImpl(output_rank_or_simple_broadcast, \
+    BinaryElementWiseImpl(stream, \
+                          output_rank_or_simple_broadcast, \
                           lhs_padded_strides, \
                           lhs_data, \
                           rhs_padded_strides, \
@@ -34,7 +35,8 @@ namespace cuda {
   }

 #define CONTRIB_SPECIALIZED_BINARY_ELEMENTWISE_IMPL(x, T) \
-  template void Impl_##x<T>(int32_t output_rank, \
+  template void Impl_##x<T>(cudaStream_t stream, \
+                            int32_t output_rank, \
                             const TArray<int64_t>* lhs_padded_strides, \
                             const T* lhs_data, \
                             const TArray<int64_t>* rhs_padded_strides, \
@@ -20,6 +20,7 @@ namespace cuda {
 #define CONTRIB_BINARY_ELEMENTWISE_IMPL_DECLARATION(name) \
   template <typename T> \
   void Impl_##name( \
+      cudaStream_t stream, \
      int32_t output_rank_or_simple_broadcast, \
      const TArray<int64_t>* lhs_padded_strides, \
      const T* lhs_data, \
@@ -42,6 +42,7 @@ Status ComplexMul<T, is_conj>::ComputeInternal(OpKernelContext* context) const {
   BinaryElementwisePreparation prepare;
   ORT_RETURN_IF_ERROR(Prepare(context, &prepare));
   ComplexMul_Impl<typename ToCudaType<T>::MappedType>(
+      Stream(),
       prepare.output_rank_or_simple_broadcast,
       &prepare.lhs_padded_strides,
       reinterpret_cast<const typename ToCudaType<T>::MappedType*>(prepare.lhs_tensor->template Data<T>()),
@@ -90,6 +90,7 @@ __global__ void _ElementWiseWithStrideTwo(

 template <typename T>
 void ComplexMul_Impl(
+    cudaStream_t stream,
     int32_t output_rank_or_simple_broadcast,
     const TArray<int64_t>* lhs_padded_strides,
     const T* lhs_data,
@@ -110,7 +111,7 @@ void ComplexMul_Impl(
   CUDA_LONG N = static_cast<CUDA_LONG>(count);

   if (lhs_padded_strides && rhs_padded_strides && lhs_padded_strides->Size() && rhs_padded_strides->Size())
-    _ElementWiseWithStrideTwo<T, true, true, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
+    _ElementWiseWithStrideTwo<T, true, true, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
         output_rank_or_simple_broadcast,
         *lhs_padded_strides,
         lhs_data,
@@ -123,7 +124,7 @@ void ComplexMul_Impl(
         rhs_size,
         is_conj);
   else if (lhs_padded_strides && lhs_padded_strides->Size())
-    _ElementWiseWithStrideTwo<T, true, false, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
+    _ElementWiseWithStrideTwo<T, true, false, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
         output_rank_or_simple_broadcast,
         *lhs_padded_strides,
         lhs_data,
@@ -136,7 +137,7 @@ void ComplexMul_Impl(
         rhs_size,
         is_conj);
   else
-    _ElementWiseWithStrideTwo<T, false, true, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
+    _ElementWiseWithStrideTwo<T, false, true, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
         output_rank_or_simple_broadcast,
         *lhs_padded_strides,
         lhs_data,
@@ -152,6 +153,7 @@ void ComplexMul_Impl(

 #define SPECIALIZE_STACKEDCOMPLEXMUL_IMPL(T) \
   template void ComplexMul_Impl<T>( \
+      cudaStream_t stream, \
       int32_t output_rank_or_simple_broadcast, \
       const TArray<int64_t>* lhs_padded_strides, \
       const T* lhs_data, \
@@ -13,6 +13,7 @@ using namespace ::onnxruntime::cuda;

 template <typename T>
 void ComplexMul_Impl(
+    cudaStream_t stream,
     int32_t output_rank_or_simple_broadcast,
     const TArray<int64_t>* lhs_padded_strides,
     const T* lhs_data,
@@ -127,11 +127,11 @@ Status FFTBase<T>::DoFFT(OpKernelContext* context, const Tensor* X, bool complex
   Tensor* Y = const_cast<OpKernelContext*>(context)->Output(0, TensorShape(output_dims));
   auto* x_data = reinterpret_cast<const CudaT*>(X->template Data<T>());
   auto* y_data = reinterpret_cast<CudaT*>(Y->template MutableData<T>());

+  CUFFT_RETURN_IF_ERROR(cufftSetStream(plan_info.plan, Stream()));
   CUFFT_RETURN_IF_ERROR(cufftXtExec(plan_info.plan, const_cast<CudaT*>(x_data), y_data, inverse ? CUFFT_INVERSE : CUFFT_FORWARD));
-
   if (inverse) {
-    PostProcess(signal_dims, output_size, y_data);
+    PostProcess(Stream(), signal_dims, output_size, y_data);
   }

   return Status::OK();
@@ -27,14 +27,14 @@ __global__ void _Normalize(
 }

 template <typename T>
-void PostProcess(const std::vector<int64_t>& signal_dims, int64_t N, T* output_data) {
+void PostProcess(cudaStream_t stream, const std::vector<int64_t>& signal_dims, int64_t N, T* output_data) {
   int64_t scale = std::accumulate(signal_dims.begin(), signal_dims.end(), 1ll, std::multiplies<int64_t>());
   int blocksPerGrid = (int)(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
-  _Normalize<T><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(output_data, N, static_cast<int>(scale));
+  _Normalize<T><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(output_data, N, static_cast<int>(scale));
 }

 #define SPECIALIZED_IMPL(T) \
-  template void PostProcess<T>(const std::vector<int64_t>& signal_dims, int64_t N, T* output_data);
+  template void PostProcess<T>(cudaStream_t stream, const std::vector<int64_t>& signal_dims, int64_t N, T* output_data);

 SPECIALIZED_IMPL(float)
 SPECIALIZED_IMPL(double)
@@ -12,7 +12,7 @@ namespace contrib {
 namespace cuda {

 template <typename T>
-void PostProcess(const std::vector<int64_t>& signal_dims, int64_t N, T* output_data);
+void PostProcess(cudaStream_t stream, const std::vector<int64_t>& signal_dims, int64_t N, T* output_data);

 }  // namespace cuda
 }  // namespace contrib
@@ -158,6 +158,7 @@ Status QAttention<T, int8_t>::ComputeInternal(OpKernelContext* context) const {
   }
   // scale back and bias
   CudaDequantizeWithBias(
+      Stream(),
       gemm_buffer_quantized.get(),
       reinterpret_cast<const CudaT*>(bias->template Data<T>()),
       reinterpret_cast<CudaT*>(gemm_buffer.get()),
@@ -172,6 +173,7 @@ Status QAttention<T, int8_t>::ComputeInternal(OpKernelContext* context) const {
   auto temp_buffer = GetScratchBuffer<void>(workSpaceSize);
   if (!LaunchAttentionKernel(
           GetDeviceProp(),
+          Stream(),
           reinterpret_cast<const CudaT*>(gemm_buffer.get()),
           nullptr == mask_index ? nullptr : mask_index->template Data<int>(),
           nullptr == mask_index ? nullptr : &(mask_index->Shape().GetDims()),
@@ -31,10 +31,10 @@ __global__ void DequantizeLinearKernel(const int32_t* quantize, const T* bias, T
 }

 template <class T>
-Status CudaDequantizeWithBias(const int32_t* quantize, const T* bias, T* output, T scale, int m, int n) {
+Status CudaDequantizeWithBias(cudaStream_t stream, const int32_t* quantize, const T* bias, T* output, T scale, int m, int n) {
   int blocksPerGrid = static_cast<int>(CeilDiv(m * n, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
   CUDA_LONG N = static_cast<CUDA_LONG>(m * n);
-  DequantizeLinearKernel<T, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
+  DequantizeLinearKernel<T, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
       quantize,
       bias,
       output,
@@ -44,8 +44,8 @@ Status CudaDequantizeWithBias(const int32_t* quantize, const T* bias, T* output,
   return Status::OK();
 }

-template Status CudaDequantizeWithBias<float>(const int32_t* quantize, const float* bias, float* output, float scale, int m, int n);
-template Status CudaDequantizeWithBias<half>(const int32_t* quantize, const half* bias, half* output, half scale, int m, int n);
+template Status CudaDequantizeWithBias<float>(cudaStream_t stream, const int32_t* quantize, const float* bias, float* output, float scale, int m, int n);
+template Status CudaDequantizeWithBias<half>(cudaStream_t stream, const int32_t* quantize, const half* bias, half* output, half scale, int m, int n);

 }  // namespace cuda
 }  // namespace contrib
@@ -8,7 +8,7 @@ namespace onnxruntime {
 namespace contrib {
 namespace cuda {
 template <class Tin>
-Status CudaDequantizeWithBias(const int32_t* quantize, const Tin* bias, Tin* output, Tin scale, int m, int n);
+Status CudaDequantizeWithBias(cudaStream_t stream, const int32_t* quantize, const Tin* bias, Tin* output, Tin scale, int m, int n);

 }  // namespace cuda
 }  // namespace contrib
@@ -56,6 +56,7 @@ Status Crop<T>::ComputeInternal(OpKernelContext* context) const {
   fast_divmod fdm_YHW(gsl::narrow_cast<int>((bottomLimit - topBorder) * (rightLimit - leftBorder)));

   CropImpl<CudaT>(
+      Stream(),
       reinterpret_cast<const CudaT*>(X->template Data<T>()),
       gsl::narrow_cast<int>(leftBorder),
       gsl::narrow_cast<int>(topBorder),
@@ -31,6 +31,7 @@ __global__ void _CropKernel(

 template <typename T>
 void CropImpl(
+    cudaStream_t stream,
     const T* input_data,
     const int src_start_x,
     const int src_start_y,
@@ -41,12 +42,12 @@ void CropImpl(
     T* output_data,
     const size_t N) {
   int blocksPerGrid = (int)(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
-  _CropKernel<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
+  _CropKernel<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
       input_data, src_start_x, src_start_y, src_w, src_hw, fdm_dst_w, fdm_dst_hw, output_data, (CUDA_LONG)N);
 }

 #define SPECIALIZED_IMPL(T) \
-  template void CropImpl<T>(const T* input_data, const int src_start_x, const int src_start_y, const int src_w, const int src_hw, const fast_divmod& fdm_dst_w, const fast_divmod& fdm_dst_hw, T* output_data, const size_t N);
+  template void CropImpl<T>(cudaStream_t stream, const T* input_data, const int src_start_x, const int src_start_y, const int src_w, const int src_hw, const fast_divmod& fdm_dst_w, const fast_divmod& fdm_dst_hw, T* output_data, const size_t N);

 SPECIALIZED_IMPL(float)
 SPECIALIZED_IMPL(double)
@@ -12,6 +12,7 @@ using namespace onnxruntime::cuda;

 template <typename T>
 void CropImpl(
+    cudaStream_t stream,
     const T* input_data,
     const int src_start_x,
     const int src_start_y,
@@ -30,7 +30,7 @@ ImageScaler<T>::ImageScaler(const OpKernelInfo& info) : CudaKernel(info) {
   ORT_ENFORCE(info.GetAttrs<float>("bias", bias_).IsOK());

   b_data_ = GetScratchBuffer<float>(bias_.size());
-  CUDA_CALL_THROW(cudaMemcpy(b_data_.get(), bias_.data(), sizeof(float) * bias_.size(), cudaMemcpyHostToDevice));
+  CUDA_CALL_THROW(cudaMemcpyAsync(b_data_.get(), bias_.data(), sizeof(float) * bias_.size(), cudaMemcpyHostToDevice, Stream()));
 }

 template <typename T>
@@ -53,6 +53,7 @@ Status ImageScaler<T>::ComputeInternal(OpKernelContext* context) const {

   typedef typename ToCudaType<T>::MappedType CudaT;
   ImageScalerImpl<CudaT>(
+      Stream(),
       reinterpret_cast<const CudaT*>(X->template Data<T>()),
       scale_,
       b_data_.get(),
@@ -30,6 +30,7 @@ __global__ void _ImageScalerKernel(

 template <typename T>
 void ImageScalerImpl(
+    cudaStream_t stream,
     const T* input_data,
     const float scale,
     const float* bias_data,
@@ -40,17 +41,17 @@ void ImageScalerImpl(
   fast_divmod fdm_HW((int)(dims[2] * dims[3]));
   fast_divmod fdm_C;
   if (dims[0] == 1) {
-    _ImageScalerKernel<T, true><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
+    _ImageScalerKernel<T, true><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
        input_data, scale, bias_data, fdm_C, fdm_HW, output_data, N);
   } else {
     fdm_C = fast_divmod((int)dims[1]);
-    _ImageScalerKernel<T, false><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
+    _ImageScalerKernel<T, false><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
        input_data, scale, bias_data, fdm_C, fdm_HW, output_data, N);
   }
 }

 #define SPECIALIZED_IMPL(T) \
-  template void ImageScalerImpl<T>(const T* input_data, const float scale, const float* bias_data, const int64_t dims[4], T* output_data, const size_t N);
+  template void ImageScalerImpl<T>(cudaStream_t stream, const T* input_data, const float scale, const float* bias_data, const int64_t dims[4], T* output_data, const size_t N);

 SPECIALIZED_IMPL(float)
 SPECIALIZED_IMPL(double)
@@ -10,6 +10,7 @@ namespace cuda {

 template <typename T>
 void ImageScalerImpl(
+    cudaStream_t stream,
     const T* input_data,
     const float scale,
     const float* bias_data,
@@ -15,6 +15,7 @@ namespace rocm {

 template <typename T>
 void DispatchBiasSoftmaxForwardImpl(
+    hipStream_t stream,
     Tensor* output_tensor,
     const Tensor* input_tensor,
     const Tensor* input_bias_tensor,
@@ -25,6 +26,7 @@ void DispatchBiasSoftmaxForwardImpl(

 template <typename T>
 void DispatchBiasSoftMaxForwardViaDnnLibraryImpl(
+    hipStream_t stream,
     miopenHandle_t miopenHandle,
     int element_count,
     int batch_count,
@@ -67,12 +69,12 @@ Status BiasSoftmax::ComputeInternal(OpKernelContext* ctx) const {
     // expect thread blocks can fill SM at high occupancy without overflowing registers
     utils::MLTypeCallDispatcher<DispatchBiasSoftmaxForward, float, MLFloat16>
         t_disp(X->GetElementType());
-    t_disp.Invoke(Y, X, B, D, N, D, broadcast_size);
+    t_disp.Invoke(Stream(), Y, X, B, D, N, D, broadcast_size);
   } else {
     // need to fallback to add kernel + CUDA DNN library softmax call :/
     utils::MLTypeCallDispatcher<DispatchBiasSoftMaxForwardViaDnnLibrary, float, MLFloat16>
         t_disp(X->GetElementType());
-    t_disp.Invoke(MiopenHandle(), D, N, broadcast_axis, softmax_axis, X_shape, X, B_shape, B, Y);
+    t_disp.Invoke(Stream(), MiopenHandle(), D, N, broadcast_axis, softmax_axis, X_shape, X, B_shape, B, Y);
   }

   return Status::OK();
@@ -80,6 +82,7 @@ Status BiasSoftmax::ComputeInternal(OpKernelContext* ctx) const {

 template <typename T>
 void DispatchBiasSoftmaxForward<T>::operator()(
+    hipStream_t stream,
     Tensor* output,
     const Tensor* input,
     const Tensor* input_bias,
@@ -88,6 +91,7 @@ void DispatchBiasSoftmaxForward<T>::operator()(
     int batch_stride,
     int bias_broadcast_size_per_batch) {
   DispatchBiasSoftmaxForwardImpl<T>(
+      stream,
       output,
       input,
       input_bias,
@@ -99,6 +103,7 @@ void DispatchBiasSoftmaxForward<T>::operator()(

 template <typename T>
 void DispatchBiasSoftMaxForwardViaDnnLibrary<T>::operator()(
+    hipStream_t stream,
     miopenHandle_t miopenHandle,
     int element_count,
     int batch_count,
@@ -110,6 +115,7 @@ void DispatchBiasSoftMaxForwardViaDnnLibrary<T>::operator()(
     const onnxruntime::Tensor* B,
     onnxruntime::Tensor* Y) {
   DispatchBiasSoftMaxForwardViaDnnLibraryImpl<T>(
+      stream,
       miopenHandle,
       element_count,
       batch_count,
@@ -13,6 +13,7 @@ namespace rocm {
 template <typename T>
 struct DispatchBiasSoftmaxForward {
   void operator()(
+      hipStream_t stream,
       Tensor* output,
       const Tensor* input,
       const Tensor* input_bias,
@@ -25,6 +26,7 @@ struct DispatchBiasSoftmaxForward {
 template <typename T>
 struct DispatchBiasSoftMaxForwardViaDnnLibrary {
   void operator()(
+      hipStream_t stream,
       miopenHandle_t miopenHandle,
       int element_count,
       int batch_count,
@ -128,6 +128,7 @@ __global__ void BiasSoftmaxWarpForward(
|
|||
|
||||
template <typename T>
|
||||
void DispatchBiasSoftmaxForwardImpl(
|
||||
hipStream_t stream,
|
||||
Tensor* output_tensor,
|
||||
const Tensor* input_tensor,
|
||||
const Tensor* input_bias_tensor,
|
||||
|
|
@ -168,47 +169,47 @@ void DispatchBiasSoftmaxForwardImpl(
|
|||
// Launch code would be more elegant if C++ supported FOR CONSTEXPR
|
||||
switch (log2_elements) {
|
||||
case 0: // 1
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 0>), dim3(blocks), dim3(threads), 0, 0,
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 0>), dim3(blocks), dim3(threads), 0, stream,
|
||||
output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
|
||||
break;
|
||||
case 1: // 2
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 1>), dim3(blocks), dim3(threads), 0, 0,
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 1>), dim3(blocks), dim3(threads), 0, stream,
|
||||
output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
|
||||
break;
|
||||
case 2: // 4
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 2>), dim3(blocks), dim3(threads), 0, 0,
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 2>), dim3(blocks), dim3(threads), 0, stream,
|
||||
output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
|
||||
break;
|
||||
case 3: // 8
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 3>), dim3(blocks), dim3(threads), 0, 0,
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 3>), dim3(blocks), dim3(threads), 0, stream,
|
||||
output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
|
||||
break;
|
||||
case 4: // 16
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 4>), dim3(blocks), dim3(threads), 0, 0,
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 4>), dim3(blocks), dim3(threads), 0, stream,
output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
break;
case 5: // 32
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 5>), dim3(blocks), dim3(threads), 0, 0,
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 5>), dim3(blocks), dim3(threads), 0, stream,
output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
break;
case 6: // 64
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 6>), dim3(blocks), dim3(threads), 0, 0,
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 6>), dim3(blocks), dim3(threads), 0, stream,
output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
break;
case 7: // 128
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 7>), dim3(blocks), dim3(threads), 0, 0,
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 7>), dim3(blocks), dim3(threads), 0, stream,
output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
break;
case 8: // 256
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 8>), dim3(blocks), dim3(threads), 0, 0,
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 8>), dim3(blocks), dim3(threads), 0, stream,
output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
break;
case 9: // 512
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 9>), dim3(blocks), dim3(threads), 0, 0,
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 9>), dim3(blocks), dim3(threads), 0, stream,
output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
break;
case 10: // 1024
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 10>), dim3(blocks), dim3(threads), 0, 0,
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward<input_t, output_t, acc_t, 10>), dim3(blocks), dim3(threads), 0, stream,
output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch);
break;
default:

@@ -218,6 +219,7 @@ void DispatchBiasSoftmaxForwardImpl(

#define SPECIALIZED_BIAS_SOFTMAX_IMPL(T) \
template void DispatchBiasSoftmaxForwardImpl<T>( \
hipStream_t stream, \
Tensor * output_tensor, \
const Tensor* input_tensor, \
const Tensor* input_bias_tensor, \

@@ -234,6 +236,7 @@ SPECIALIZED_BIAS_SOFTMAX_IMPL(MLFloat16)

// note: This is an unhappy path! There is no performance benefit for the fusion.
template <typename T>
void DispatchBiasSoftMaxForwardViaDnnLibraryImpl(
hipStream_t stream,
miopenHandle_t miopenHandle,
int element_count,
int batch_count,

@@ -278,6 +281,7 @@ void DispatchBiasSoftMaxForwardViaDnnLibraryImpl(

// invoke elementwise add with broadcast kernel
::onnxruntime::rocm::BinaryElementWiseImpl(
stream,
(int32_t)X_shape.NumDimensions(),
&lhs_padded_strides,
X_data,

@@ -311,6 +315,7 @@ void DispatchBiasSoftMaxForwardViaDnnLibraryImpl(

#define SPECIALIZED_BIAS_SOFTMAX_IMPL_VIA_DNN(T) \
template void DispatchBiasSoftMaxForwardViaDnnLibraryImpl<T>( \
hipStream_t stream, \
miopenHandle_t miopenHandle, \
int element_count, \
int batch_count, \
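Aside: across the HIP hunks above (and the CUDA hunks later in this commit), the mechanical change is always the same: the null/default stream slot in the launch becomes the session's compute stream. A minimal standalone HIP sketch of that call shape, with an illustrative kernel that is not from this commit:

#include <hip/hip_runtime.h>

__global__ void scale(float* data, float factor, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

void launch_scale(float* device_data, float factor, int n, hipStream_t stream) {
  dim3 threads(256);
  dim3 blocks((n + 255) / 256);
  // args: kernel, grid, block, dynamic shared memory bytes, stream, kernel args...
  // Passing 0 for the stream (as the old lines did) targets the default stream.
  hipLaunchKernelGGL(scale, blocks, threads, 0, stream, device_data, factor, n);
}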
@@ -160,14 +160,16 @@ struct ProviderHostImpl : ProviderHost {
return onnxruntime::make_unique<CUDAPinnedAllocator>(device_id, name);
}

std::unique_ptr<IDataTransfer> CreateGPUDataTransfer() override { return onnxruntime::make_unique<GPUDataTransfer>(); }

void cuda__Impl_Cast(const int64_t* input_data, int32_t* output_data, size_t count) override {
return cuda::Impl_Cast(input_data, output_data, count);
std::unique_ptr<IDataTransfer> CreateGPUDataTransfer(void* stream) override {
return onnxruntime::make_unique<GPUDataTransfer>(static_cast<cudaStream_t>(stream));
}

void cuda__Impl_Cast(const int32_t* input_data, int64_t* output_data, size_t count) override {
return cuda::Impl_Cast(input_data, output_data, count);
void cuda__Impl_Cast(void* stream, const int64_t* input_data, int32_t* output_data, size_t count) override {
return cuda::Impl_Cast(static_cast<cudaStream_t>(stream), input_data, output_data, count);
}

void cuda__Impl_Cast(void* stream, const int32_t* input_data, int64_t* output_data, size_t count) override {
return cuda::Impl_Cast(static_cast<cudaStream_t>(stream), input_data, output_data, count);
}

bool CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) override { return CudaCall<cudaError, false>(cudaError(retCode), exprString, libName, cudaError(successCode), msg); }

@@ -684,6 +686,13 @@ std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Tensor
return nullptr;
}

std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Tensorrt(const OrtTensorRTProviderOptions* provider_options) {
if (auto provider = s_library_tensorrt.Get())
return provider->CreateExecutionProviderFactory(provider_options);

return nullptr;
}

std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_OpenVINO(const OrtOpenVINOProviderOptions* provider_options) {
if (auto provider = s_library_openvino.Get())
return provider->CreateExecutionProviderFactory(provider_options);

@@ -719,6 +728,16 @@ ORT_API_STATUS_IMPL(OrtSessionOptionsAppendExecutionProvider_Tensorrt, _In_ OrtS
return nullptr;
}

ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider_TensorRT, _In_ OrtSessionOptions* options, _In_ const OrtTensorRTProviderOptions* tensorrt_options) {
auto factory = onnxruntime::CreateExecutionProviderFactory_Tensorrt(tensorrt_options);
if (!factory) {
return OrtApis::CreateStatus(ORT_FAIL, "SessionOptionsAppendExecutionProvider_Tensorrt: Failed to load shared library");
}

options->provider_factories.push_back(factory);
return nullptr;
}

ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider_OpenVINO, _In_ OrtSessionOptions* options, _In_ const OrtOpenVINOProviderOptions* provider_options) {
auto factory = onnxruntime::CreateExecutionProviderFactory_OpenVINO(provider_options);
if (!factory) {
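For orientation, a hedged end-to-end sketch of how an application would exercise the new options. Struct and function names are the ones introduced by this change; everything else (error handling, the remaining option fields, the zero-initialization) is illustrative only and not a prescribed recipe:

#include <cuda_runtime.h>
#include "onnxruntime_c_api.h"

// Sketch: route one application-owned CUDA stream to both the TensorRT and
// CUDA EPs so they share it, mirroring "use same stream for both tensorrt and
// CUDA EP" from the commit message. Assumes `api` and `session_options` were
// created by the caller; OrtStatus* return values are ignored for brevity.
void AppendProvidersWithUserStream(const OrtApi* api, OrtSessionOptions* session_options) {
  cudaStream_t stream;
  cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

  OrtTensorRTProviderOptions trt_options{};
  trt_options.device_id = 0;
  trt_options.has_user_compute_stream = 1;
  trt_options.user_compute_stream = stream;
  api->SessionOptionsAppendExecutionProvider_TensorRT(session_options, &trt_options);

  OrtCUDAProviderOptions cuda_options{};  // other fields should be set per the header defaults
  cuda_options.has_user_compute_stream = 1;
  cuda_options.user_compute_stream = stream;
  api->SessionOptionsAppendExecutionProvider_CUDA(session_options, &cuda_options);

  // The stream remains owned by the application; the EP will not destroy it.
}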
@@ -178,7 +178,8 @@ class LoopImpl {
LoopImpl(OpKernelContextInternal& context,
const SessionState& session_state,
const Loop::Info& info,
const Loop::ConcatOutput& concat_output_func);
const Loop::ConcatOutput& concat_output_func,
void* stream);

// Initialize by validating all the inputs, and allocating the output tensors
Status Initialize();

@@ -211,9 +212,11 @@ class LoopImpl {
std::vector<std::vector<OrtValue>> loop_output_tensors_;

const Loop::ConcatOutput& concat_output_func_;
void* stream_;
};

static Status ConcatenateCpuOutput(std::vector<OrtValue>& per_iteration_output,
static Status ConcatenateCpuOutput(void* /*stream*/,
std::vector<OrtValue>& per_iteration_output,
void* output, size_t output_size_in_bytes) {
const auto& first_output = per_iteration_output.front().Get<Tensor>();
const auto& per_iteration_shape = first_output.Shape();

@@ -253,6 +256,7 @@ Loop::Loop(const OpKernelInfo& info) : IControlFlowKernel(info) {
ORT_IGNORE_RETURN_VALUE(proto);

concat_output_func_ = ConcatenateCpuOutput;
stream_ = nullptr;
}

// we need this to be in the .cc so 'unique_ptr<Info> info_' can be handled

@@ -345,7 +349,7 @@ Status Loop::Compute(OpKernelContext* ctx) const {
ORT_ENFORCE(session_state, "Subgraph SessionState was not found for 'body' attribute.");
ORT_ENFORCE(feeds_fetches_manager_, "CreateFeedsFetchesManager must be called prior to execution of graph.");

LoopImpl loop_impl{*ctx_internal, *session_state, *info_, concat_output_func_};
LoopImpl loop_impl{*ctx_internal, *session_state, *info_, concat_output_func_, stream_};

auto status = loop_impl.Initialize();
ORT_RETURN_IF_ERROR(status);

@@ -358,12 +362,14 @@ Status Loop::Compute(OpKernelContext* ctx) const {
LoopImpl::LoopImpl(OpKernelContextInternal& context,
const SessionState& session_state,
const Loop::Info& subgraph_info,
const Loop::ConcatOutput& concat_output_func)
const Loop::ConcatOutput& concat_output_func,
void* stream)
: context_(context),
session_state_(session_state),
info_(subgraph_info),
implicit_inputs_(context_.GetImplicitInputs()),
concat_output_func_(concat_output_func) {
concat_output_func_(concat_output_func),
stream_(stream) {
auto* max_trip_count_tensor = context.Input<Tensor>(0);
max_trip_count_ = max_trip_count_tensor ? *max_trip_count_tensor->Data<int64_t>() : INT64_MAX;

@@ -457,7 +463,7 @@ Status LoopImpl::ConcatenateLoopOutput(std::vector<OrtValue>& per_iteration_outp
TensorShape output_shape{dims};
Tensor* output = context_.Output(output_index, output_shape);

ORT_RETURN_IF_ERROR(concat_output_func_(per_iteration_output, output->MutableDataRaw(), output->SizeInBytes()));
ORT_RETURN_IF_ERROR(concat_output_func_(stream_, per_iteration_output, output->MutableDataRaw(), output->SizeInBytes()));

return Status::OK();
}

@@ -29,17 +29,19 @@ class Loop : public controlflow::IControlFlowKernel {
// function to concatenate the OrtValue instances from each Loop iteration into a single output buffer.
// @param per_iteration_output OrtValue instances from each iteration. Never empty. All should have the same shape.
// @param output Pre-allocated output buffer. On device specific to the ExecutionProvider running the Loop node.
using ConcatOutput = std::function<Status(std::vector<OrtValue>& per_iteration_output,
using ConcatOutput = std::function<Status(void* stream, std::vector<OrtValue>& per_iteration_output,
void* output, size_t output_size_in_bytes)>;

protected:
// derived class can provide implementation for handling concatenation of Loop output on a different device
void SetConcatOutputFunc(const ConcatOutput& concat_output_func) { concat_output_func_ = concat_output_func; }
void SetComputeStream(void* stream) { stream_ = stream; }

private:
// Info and FeedsFetchesManager re-used for each subgraph execution.
std::unique_ptr<Info> info_;
std::unique_ptr<FeedsFetchesManager> feeds_fetches_manager_;
ConcatOutput concat_output_func_;
void* stream_;
};
} // namespace onnxruntime
@@ -14,7 +14,7 @@ namespace DeviceHelpers {
namespace CpuDeviceHelpers {

// CPU specific Data copy helper
Status DataCopy(const Tensor& input, Tensor& output) {
Status DataCopy(const Tensor& input, Tensor& output, void* /*einsum_cuda_assets*/) {
ORT_ENFORCE(output.SizeInBytes() == input.SizeInBytes(),
"Einsum op: The candidate output does not match the actual output's shape");
// There are no string tensors in Einsum's case - so safely use memcpy

@@ -156,7 +156,7 @@ static std::unique_ptr<Tensor> DiagonalInnermostDims(const Tensor& input,
return output;
}

std::unique_ptr<Tensor> Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator) {
std::unique_ptr<Tensor> Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator, void* /*einsum_cuda_assets*/) {
const auto& input_shape = input.Shape();
const auto& input_dims = input_shape.GetDims();
auto rank = static_cast<int64_t>(input_dims.size());

@@ -23,7 +23,7 @@ namespace EinsumOp {
namespace DeviceHelpers {

// Data copy op - Copies raw data from the source tensor's buffer to the destination tensor's buffer
using DataCopy = std::function<Status(const Tensor& input, Tensor& output)>;
using DataCopy = std::function<Status(const Tensor& input, Tensor& output, void* einsum_cuda_assets)>;

// Transpose op - Transposes given input based on data in `permutation`
using Transpose = std::function<Status(const std::vector<size_t>& permutation, const Tensor& input,

@@ -54,12 +54,12 @@ using ReduceSum = std::function<Tensor(const Tensor& input, const std::vector<in
// Eg. input_shape = [2, 3, 5, 3] and dim_1 = 1 and dim_2 = 3
// The output_shape will be [2, 3, 5] and dim_1 will contain the diagonal elements
using Diagonal = std::function<std::unique_ptr<Tensor>(const Tensor& input, int64_t dim_1, int64_t dim_2,
AllocatorPtr allocator)>;
AllocatorPtr allocator, void* einsum_cuda_assets)>;

// These are CPU specific device helper implementations
namespace CpuDeviceHelpers {

Status DataCopy(const Tensor& input, Tensor& output);
Status DataCopy(const Tensor& input, Tensor& output, void* einsum_cuda_assets);

Status Transpose(const std::vector<size_t>& permutation, const Tensor& input,
Tensor& output, const TensorShape* input_shape_override, void* einsum_cuda_assets);

@@ -76,7 +76,7 @@ Tensor ReduceSum(const Tensor& input, const std::vector<int64_t>& reduce_axes,
const TensorShape* input_shape_override,
concurrency::ThreadPool* tp, void* einsum_cuda_assets);

std::unique_ptr<Tensor> Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator);
std::unique_ptr<Tensor> Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator, void* einsum_cuda_assets);

} // namespace CpuDeviceHelpers

@@ -440,7 +440,7 @@ Status EinsumComputePreprocessor::PreprocessInputs() {
preprocessed = device_diagonal_func_(preprocessed ? *preprocessed : *inputs_[input_iter],
subscript_indices_to_input_index[subscript_index],
dim_index_in_preprocessed_input,
allocator_);
allocator_, einsum_ep_assets_);
}
++dim_index_in_original_input;
}

@@ -58,13 +58,13 @@ void EinsumTypedComputeProcessor<T>::FinalizeOutput(const Tensor& candidate_outp
// into the buffer of the actual output given to us by the execution frame
// We need to do this because the buffer owned by the output tensor of the op could be user provided buffer

auto status = device_data_copy_func_(*candidate_output_transposed, output);
auto status = device_data_copy_func_(*candidate_output_transposed, output, einsum_ep_assets_);
ORT_ENFORCE(status.IsOK(), "Einsum op: Could not copy the intermediate output's buffer into the op's output buffer. Error: ",
status.ErrorMessage());

} else {
// Copy the output candidate into the op's output
auto status = device_data_copy_func_(candidate_output, output);
auto status = device_data_copy_func_(candidate_output, output, einsum_ep_assets_);
ORT_ENFORCE(status.IsOK(), "Einsum op: Could not copy the intermediate output's buffer into the op's output buffer. Error: ",
status.ErrorMessage());
}
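The Einsum device helpers now all carry a void* einsum_cuda_assets parameter, the opaque handle through which a GPU backend can recover its compute stream (the CPU helpers ignore it). A standalone sketch of the copy pattern a CUDA-side DataCopy would adopt under that scheme; function and parameter names here are illustrative, not taken from the commit:

#include <cuda_runtime.h>
#include <cstddef>

// Raw device-to-device copy enqueued on a caller-provided stream instead of an
// implicit default-stream copy; non-blocking for the host, ordered with respect
// to other work on `stream`.
cudaError_t CopyDeviceBuffer(void* dst, const void* src, size_t bytes, cudaStream_t stream) {
  return cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToDevice, stream);
}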
@@ -38,6 +38,7 @@ namespace cuda {
ORT_RETURN_IF_ERROR(UnaryElementwise::Prepare(context, &p)); \
Ctx##x func_ctx = MakeFuncCtx(); \
Impl_##x<typename ToCudaType<T>::MappedType>( \
Stream(), \
reinterpret_cast<const typename ToCudaType<T>::MappedType*>(p.input_tensor->template Data<T>()), \
reinterpret_cast<typename ToCudaType<T>::MappedType*>(p.output_tensor->template MutableData<T>()), \
&func_ctx, p.output_tensor->Shape().Size()); \

@@ -84,14 +84,15 @@ struct OP_ThresholdedRelu : public CtxThresholdedRelu {

#define UNARY_ACTIVATION_IMPL(name) \
UNARY_ACTIVATION_IMPL_DECLARATION(name) { \
UnaryElementWiseImpl(input_data, \
UnaryElementWiseImpl(stream, \
input_data, \
output_data, \
*reinterpret_cast<const OP_##name<T>*>(func_ctx), \
count); \
}

#define SPECIALIZED_UNARY_ACTIVATION_IMPL(name, T) \
template void Impl_##name<T>(const T* input_data, T* output_data, const Ctx##name* func_ctx, size_t count);
template void Impl_##name<T>(cudaStream_t stream, const T* input_data, T* output_data, const Ctx##name* func_ctx, size_t count);

#if CUDA_VERSION >= 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__))
#define SPECIALIZED_UNARY_ACTIVATION_IMPL_BF16(name) SPECIALIZED_UNARY_ACTIVATION_IMPL(name, nv_bfloat16)

@@ -48,6 +48,7 @@ typedef CtxAlpha CtxThresholdedRelu;
#define UNARY_ACTIVATION_IMPL_DECLARATION(name) \
template <typename T> \
void Impl_##name( \
cudaStream_t stream, \
const T* input_data, \
T* output_data, \
const Ctx##name* func_ctx, \
@@ -51,7 +51,7 @@ ONNX_OPERATOR_KERNEL_EX(Loop,
.TypeConstraint("V", DataTypeImpl::AllFixedSizeTensorTypes()),
Loop);

static Status ConcatenateGpuOutput(std::vector<OrtValue>& per_iteration_output,
static Status ConcatenateGpuOutput(void* stream, std::vector<OrtValue>& per_iteration_output,
void* output, ptrdiff_t output_size_in_bytes) {
const auto& first_output = per_iteration_output.front().Get<Tensor>();
const auto& per_iteration_shape = first_output.Shape();

@@ -68,8 +68,8 @@ static Status ConcatenateGpuOutput(std::vector<OrtValue>& per_iteration_output,
" Expected:", per_iteration_shape, " Got:", iteration_data.Shape());
}

CUDA_RETURN_IF_ERROR(cudaMemcpy(cur_output, iteration_data.DataRaw(), bytes_per_iteration,
cudaMemcpyDeviceToDevice));
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(cur_output, iteration_data.DataRaw(), bytes_per_iteration,
cudaMemcpyDeviceToDevice, static_cast<cudaStream_t>(stream)));

cur_output = static_cast<void*>((static_cast<gsl::byte*>(cur_output) + bytes_per_iteration));
}

@@ -82,6 +82,7 @@ static Status ConcatenateGpuOutput(std::vector<OrtValue>& per_iteration_output,

Loop::Loop(const OpKernelInfo& info) : onnxruntime::Loop(info) {
SetConcatOutputFunc(ConcatenateGpuOutput);
SetComputeStream(static_cast<void*>(info.GetExecutionProvider()->GetComputeStream()));
}

Status Loop::Compute(OpKernelContext* ctx) const {
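The switch from blocking cudaMemcpy to cudaMemcpyAsync is safe here because the per-iteration outputs are produced, concatenated, and consumed on the same compute stream, so stream ordering replaces the implicit host synchronization. A standalone illustration of that concat pattern (names illustrative, not from the commit):

#include <cuda_runtime.h>
#include <vector>

// Device-to-device concat with no host sync: every chunk copy is enqueued on
// `stream`, and any later kernel on `stream` is guaranteed to see the result.
void ConcatOnStream(void* dst, const std::vector<const void*>& chunks,
                    size_t bytes_per_chunk, cudaStream_t stream) {
  char* out = static_cast<char*>(dst);
  for (const void* chunk : chunks) {
    cudaMemcpyAsync(out, chunk, bytes_per_chunk, cudaMemcpyDeviceToDevice, stream);
    out += bytes_per_chunk;
  }
}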
@@ -180,6 +180,7 @@ __global__ void _BinaryElementWiseRhsPerChannelBatchN(

template <typename T, typename T1, typename T2, typename FuncT>
void BinaryElementWiseNoBroadcastImpl(
cudaStream_t stream,
const T1* lhs_data,
const T2* rhs_data,
T* output_data,

@@ -190,7 +191,7 @@ void BinaryElementWiseNoBroadcastImpl(

int blocksPerGrid = static_cast<int>(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
CUDA_LONG N = static_cast<CUDA_LONG>(count);
_BinaryElementWiseSimple<true, true, T, T1, T2, FuncT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
_BinaryElementWiseSimple<true, true, T, T1, T2, FuncT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
lhs_data,
rhs_data,
output_data,

@@ -200,6 +201,7 @@ void BinaryElementWiseNoBroadcastImpl(

template <typename T, typename T1, typename T2, typename FuncT>
void BinaryElementWiseImpl(
cudaStream_t stream,
int32_t output_rank_or_simple_broadcast,
const TArray<int64_t>* lhs_padded_strides,
const T1* lhs_data,

@@ -217,14 +219,14 @@ void BinaryElementWiseImpl(
int blocksPerGrid = static_cast<int>(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
CUDA_LONG N = static_cast<CUDA_LONG>(count);
if (output_rank_or_simple_broadcast == static_cast<int32_t>(SimpleBroadcast::NoBroadcast)) {
_BinaryElementWiseSimple<true, true, T, T1, T2, FuncT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
_BinaryElementWiseSimple<true, true, T, T1, T2, FuncT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
lhs_data,
rhs_data,
output_data,
func,
N);
} else if (output_rank_or_simple_broadcast == static_cast<int32_t>(SimpleBroadcast::LeftScalar)) {
_BinaryElementWiseSimple<false, true, T, T1, T2, FuncT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
_BinaryElementWiseSimple<false, true, T, T1, T2, FuncT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
lhs_data,
rhs_data,
output_data,

@@ -232,14 +234,14 @@ void BinaryElementWiseImpl(
N);
} else if (output_rank_or_simple_broadcast == static_cast<int32_t>(SimpleBroadcast::RightScalar)) {
_BinaryElementWiseSimple<true, false, T, T1, T2, FuncT, GridDim::maxThreadsPerBlock,
GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
lhs_data,
rhs_data,
output_data,
func,
N);
} else if (output_rank_or_simple_broadcast == static_cast<int32_t>(SimpleBroadcast::RightPerChannelBatch1)) {
_BinaryElementWiseRhsPerChannelBatch1<T, T1, T2, FuncT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
_BinaryElementWiseRhsPerChannelBatch1<T, T1, T2, FuncT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
lhs_data,
rhs_data,
fdm_H,

@@ -247,7 +249,7 @@ void BinaryElementWiseImpl(
func,
N);
} else if (output_rank_or_simple_broadcast == static_cast<int32_t>(SimpleBroadcast::RightPerChannelBatchN)) {
_BinaryElementWiseRhsPerChannelBatchN<T, T1, T2, FuncT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
_BinaryElementWiseRhsPerChannelBatchN<T, T1, T2, FuncT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
lhs_data,
rhs_data,
fdm_H,

@@ -257,7 +259,7 @@ void BinaryElementWiseImpl(
N);
} else {
if (lhs_padded_strides && rhs_padded_strides && lhs_padded_strides->Size() && rhs_padded_strides->Size())
_BinaryElementWise<T, T1, T2, FuncT, true, true, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
_BinaryElementWise<T, T1, T2, FuncT, true, true, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
output_rank_or_simple_broadcast,
*lhs_padded_strides,
lhs_data,

@@ -268,7 +270,7 @@ void BinaryElementWiseImpl(
func,
N);
else if (lhs_padded_strides && lhs_padded_strides->Size())
_BinaryElementWise<T, T1, T2, FuncT, true, false, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
_BinaryElementWise<T, T1, T2, FuncT, true, false, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
output_rank_or_simple_broadcast,
*lhs_padded_strides,
lhs_data,

@@ -279,7 +281,7 @@ void BinaryElementWiseImpl(
func,
N);
else if (rhs_padded_strides && rhs_padded_strides->Size())
_BinaryElementWise<T, T1, T2, FuncT, false, true, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
_BinaryElementWise<T, T1, T2, FuncT, false, true, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
output_rank_or_simple_broadcast,
TArray<int64_t>(), // lhs is not computed, so no need to deference lhs_padded_strides
lhs_data,

@@ -39,6 +39,7 @@ __global__ void _UnaryElementWise(

template <typename InT, typename OutT, typename FuncT>
void UnaryElementWiseImpl(
cudaStream_t stream,
const InT* input_data,
OutT* output_data,
const FuncT& func,

@@ -49,7 +50,7 @@ void UnaryElementWiseImpl(
int blocksPerGrid = static_cast<int>(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
CUDA_LONG N = static_cast<CUDA_LONG>(count);
_UnaryElementWise<InT, OutT, FuncT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
input_data,
output_data,
func,

@@ -62,6 +62,7 @@ __global__ void VariadicElementWiseNoBroadcastInputBatchKernel(
// - inputs and output have N elements
template <typename T, typename Func, int32_t max_input_batch_size>
void VariadicElementWiseNoBroadcastInputBatchImpl(
cudaStream_t stream,
Func func,
size_t N,
TArray<const T*, max_input_batch_size> inputs,

@@ -70,7 +71,7 @@ void VariadicElementWiseNoBroadcastInputBatchImpl(
constexpr int32_t threads_per_block = GridDim::maxThreadsPerBlock;
const int32_t blocks_per_grid = static_cast<int32_t>(CeilDiv(N, elements_per_thread * threads_per_block));
VariadicElementWiseNoBroadcastInputBatchKernel<T, Func, max_input_batch_size, elements_per_thread>
<<<blocks_per_grid, threads_per_block>>>(func, N, inputs, output);
<<<blocks_per_grid, threads_per_block, 0, stream>>>(func, N, inputs, output);
}

} // namespace cuda
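Every launch-site edit in these hunks is the same one-token change: the fourth launch-configuration argument. As a standalone reminder of the syntax, with an illustrative kernel that is not from the commit:

#include <cuda_runtime.h>

__global__ void axpy(float a, const float* x, float* y, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[i] = a * x[i] + y[i];
}

void launch_axpy(float a, const float* x, float* y, int n, cudaStream_t stream) {
  int threads = 256;
  int blocks = (n + threads - 1) / threads;
  // <<<grid, block, dynamic shared memory bytes, stream>>>; omitting the fourth
  // argument (as the old lines did) silently targets the default stream.
  axpy<<<blocks, threads, 0, stream>>>(a, x, y, n);
}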
@@ -59,10 +59,15 @@ ONNX_OPERATOR_KERNEL_EX(

} // namespace cuda

CUDAExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId device_id, size_t cuda_mem_limit, ArenaExtendStrategy arena_extend_strategy) {
CUDAExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId device_id, cudaStream_t stream, size_t cuda_mem_limit, ArenaExtendStrategy arena_extend_strategy) {
CUDA_CALL_THROW(cudaSetDevice(device_id));
stream_ = stream;

CUBLAS_CALL_THROW(cublasCreate(&cublas_handle_));
CUBLAS_CALL_THROW(cublasSetStream(cublas_handle_, stream));

CUDNN_CALL_THROW(cudnnCreate(&cudnn_handle_));
CUDNN_CALL_THROW(cudnnSetStream(cudnn_handle_, stream));

AllocatorCreationInfo default_memory_info(
[](OrtDevice::DeviceId id) {

@@ -103,6 +108,12 @@ CUDAExecutionProvider::CUDAExecutionProvider(const CUDAExecutionProviderInfo& in
// must wait GPU idle, otherwise cudaGetDeviceProperties might fail
CUDA_CALL_THROW(cudaDeviceSynchronize());
CUDA_CALL_THROW(cudaGetDeviceProperties(&device_prop_, info_.device_id));
if (info.has_user_compute_stream) {
external_stream_ = true;
stream_ = static_cast<cudaStream_t>(info.user_compute_stream);
} else {
CUDA_CALL_THROW(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking));
}

size_t free = 0;
size_t total = 0;

@@ -136,6 +147,10 @@ CUDAExecutionProvider::~CUDAExecutionProvider() {
ORT_IGNORE_RETURN_VALUE(cache->erase(this));
}
}

if (!external_stream_ && stream_) {
CUDA_CALL(cudaStreamDestroy(stream_));
}
}

CUDAExecutionProvider::PerThreadContext& CUDAExecutionProvider::GetPerThreadContext() const {

@@ -156,7 +171,7 @@ CUDAExecutionProvider::PerThreadContext& CUDAExecutionProvider::GetPerThreadCont

// get or create a context
if (context_state_.retired_context_pool.empty()) {
context = std::make_shared<PerThreadContext>(info_.device_id, info_.cuda_mem_limit, info_.arena_extend_strategy);
context = std::make_shared<PerThreadContext>(info_.device_id, static_cast<cudaStream_t>(GetComputeStream()), info_.cuda_mem_limit, info_.arena_extend_strategy);
} else {
context = context_state_.retired_context_pool.back();
context_state_.retired_context_pool.pop_back();

@@ -254,10 +269,24 @@ Status CUDAExecutionProvider::OnRunStart() {
Status CUDAExecutionProvider::OnRunEnd() {
// record deferred release event on default stream, and release per_thread_context
auto current_deferred_release_event = GetPerThreadContext().GetCurrentDeferredReleaseEvent();
CUDA_RETURN_IF_ERROR(cudaEventRecord(current_deferred_release_event, nullptr));
CUDA_RETURN_IF_ERROR(cudaEventRecord(current_deferred_release_event, static_cast<cudaStream_t>(GetComputeStream())));
CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(static_cast<cudaStream_t>(GetComputeStream())));
ReleasePerThreadContext();
std::lock_guard<OrtMutex> lock(deferred_release_cpu_ptr_mutex_);
deferred_release_cpu_ptr_[current_deferred_release_event].recorded = true;

return Status::OK();
}

Status CUDAExecutionProvider::SetComputeStream(void* stream) {
if (stream != stream_) {
if (stream_) {
CUDA_RETURN_IF_ERROR(cudaStreamDestroy(stream_));
}

external_stream_ = true;
stream_ = static_cast<cudaStream_t>(stream);
}
return Status::OK();
}

@@ -1878,7 +1907,7 @@ static bool CastNeedFallbackToCPU(const onnxruntime::Node& node) {
}

std::unique_ptr<onnxruntime::IDataTransfer> CUDAExecutionProvider::GetDataTransfer() const {
return onnxruntime::make_unique<onnxruntime::GPUDataTransfer>(info_.do_copy_in_default_stream);
return onnxruntime::make_unique<onnxruntime::GPUDataTransfer>(static_cast<cudaStream_t>(GetComputeStream()), info_.do_copy_in_default_stream);
}

std::vector<std::unique_ptr<ComputeCapability>>
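OnRunEnd now records the deferred-release event on the session's compute stream and synchronizes that stream before pinned host buffers are released. The record-then-wait idiom in isolation, as a standalone sketch (illustrative helper, not from the commit):

#include <cuda_runtime.h>

// Mark a point in `stream`'s work with an event, then block the host until
// everything enqueued before that point (e.g. async copies whose pinned host
// buffers are pending release) has finished.
cudaError_t RecordAndWait(cudaStream_t stream) {
  cudaEvent_t done;
  cudaError_t err = cudaEventCreateWithFlags(&done, cudaEventDisableTiming);
  if (err != cudaSuccess) return err;
  cudaEventRecord(done, stream);     // completion marker for prior work on stream
  err = cudaEventSynchronize(done);  // host waits for the marker
  cudaEventDestroy(done);
  return err;
}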
@@ -13,8 +13,8 @@
#include "core/platform/ort_mutex.h"
#include "core/providers/cuda/cuda_execution_provider_info.h"
#include "core/providers/cuda/cuda_pch.h"
#include "core/providers/cuda/gpu_data_transfer.h"
#include "core/providers/cuda/shared_inc/cuda_utils.h"
#include "core/providers/cuda/shared_inc/cuda_call.h"

namespace onnxruntime {

@@ -37,6 +37,10 @@ class CUDAExecutionProvider : public IExecutionProvider {
return nullptr;
}

Status SetComputeStream(void* stream) override;

void* GetComputeStream() const override { return static_cast<void*>(stream_); }

cublasHandle_t PerThreadCublasHandle() {
return GetPerThreadContext().CublasHandle();
}

@@ -80,6 +84,8 @@ class CUDAExecutionProvider : public IExecutionProvider {
private:
CUDAExecutionProviderInfo info_;
cudaDeviceProp device_prop_;
bool external_stream_ = false;
cudaStream_t stream_ = nullptr;
struct DeferredReleaseCPUPtrs {
bool recorded = false;
std::vector<void*> cpu_ptrs;

@@ -90,7 +96,7 @@ class CUDAExecutionProvider : public IExecutionProvider {

class PerThreadContext final {
public:
PerThreadContext(OrtDevice::DeviceId device_id, size_t cuda_mem_limit, ArenaExtendStrategy arena_extend_strategy);
PerThreadContext(OrtDevice::DeviceId device_id, cudaStream_t stream, size_t cuda_mem_limit, ArenaExtendStrategy arena_extend_strategy);
~PerThreadContext();

cublasHandle_t CublasHandle() const {

@@ -111,23 +117,23 @@ class CUDAExecutionProvider : public IExecutionProvider {
if (!constant_ones_float_) {
constant_ones_float_ = cuda::CreateConstantOnes<float>();
}
return reinterpret_cast<const T*>(constant_ones_float_->GetBuffer(count));
return reinterpret_cast<const T*>(constant_ones_float_->GetBuffer(stream_, count));
} else if (std::is_same<T, double>::value) {
if (!constant_ones_double_) {
constant_ones_double_ = cuda::CreateConstantOnes<double>();
}
return reinterpret_cast<const T*>(constant_ones_double_->GetBuffer(count));
return reinterpret_cast<const T*>(constant_ones_double_->GetBuffer(stream_, count));
} else if (std::is_same<T, half>::value) {
if (!constant_ones_half_) {
constant_ones_half_ = cuda::CreateConstantOnes<half>();
}
return reinterpret_cast<const T*>(constant_ones_half_->GetBuffer(count));
return reinterpret_cast<const T*>(constant_ones_half_->GetBuffer(stream_, count));
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
} else if (std::is_same<T, nv_bfloat16>::value) {
if (!constant_ones_bfloat16_) {
constant_ones_bfloat16_ = cuda::CreateConstantOnes<nv_bfloat16>();
}
return reinterpret_cast<const T*>(constant_ones_bfloat16_->GetBuffer(count));
return reinterpret_cast<const T*>(constant_ones_bfloat16_->GetBuffer(stream_, count));
#endif
} else {
return nullptr;

@@ -139,6 +145,7 @@ class CUDAExecutionProvider : public IExecutionProvider {
}

private:
cudaStream_t stream_ = nullptr;
cublasHandle_t cublas_handle_ = nullptr;
cudnnHandle_t cudnn_handle_ = nullptr;

@@ -18,6 +18,8 @@ struct CUDAExecutionProviderInfo {
ArenaExtendStrategy arena_extend_strategy{ArenaExtendStrategy::kNextPowerOfTwo};
OrtCudnnConvAlgoSearch cudnn_conv_algo_search{OrtCudnnConvAlgoSearch::EXHAUSTIVE};
bool do_copy_in_default_stream{true};
bool has_user_compute_stream{false};
void* user_compute_stream{nullptr};

static CUDAExecutionProviderInfo FromProviderOptions(const ProviderOptions& options);
static ProviderOptions ToProviderOptions(const CUDAExecutionProviderInfo& info);
@@ -59,7 +59,9 @@ class CudaKernel : public OpKernel {
provider_->AddDeferredReleaseCPUPtr(p);
}

const cudaDeviceProp& GetDeviceProp() const { return provider_->GetDeviceProp(); };
const cudaDeviceProp& GetDeviceProp() const { return provider_->GetDeviceProp(); }

inline cudaStream_t Stream() const { return static_cast<cudaStream_t>(provider_->GetComputeStream()); }

// To support cudaMemcpyAsync, the cpu memory should be allocated in pinned memory
// and it can only be released after the copy has finished

@@ -94,7 +96,7 @@ class CudaKernel : public OpKernel {
Status CopyToGpu() {
if (cpu_pinned_copy_) {
gpu_copy_ = op_kernel_->GetScratchBuffer<T>(count_);
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(gpu_copy_.get(), cpu_pinned_copy_.get(), count_ * sizeof(T), cudaMemcpyHostToDevice));
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(gpu_copy_.get(), cpu_pinned_copy_.get(), count_ * sizeof(T), cudaMemcpyHostToDevice, op_kernel_->Stream()));
op_kernel_->AddDeferredReleaseCPUPtr(cpu_pinned_copy_.release());
}
return Status::OK();

@@ -57,7 +57,8 @@ ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider_CUDA,
info.arena_extend_strategy = static_cast<onnxruntime::ArenaExtendStrategy>(cuda_options->arena_extend_strategy);
info.cudnn_conv_algo_search = cuda_options->cudnn_conv_algo_search;
info.do_copy_in_default_stream = cuda_options->do_copy_in_default_stream;

info.has_user_compute_stream = cuda_options->has_user_compute_stream;
info.user_compute_stream = cuda_options->user_compute_stream;
options->provider_factories.push_back(onnxruntime::CreateExecutionProviderFactory_CUDA(info));

return nullptr;
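With CudaKernel::Stream() in place, op implementations pick the session stream up from the provider instead of hard-coding the default stream. An illustrative ComputeInternal body following that pattern; the Negate op and the NegateImpl helper are hypothetical names, not part of this commit:

// Hypothetical kernel body; every launch and async copy routes through Stream().
Status Negate::ComputeInternal(OpKernelContext* ctx) const {
  const Tensor* X = ctx->Input<Tensor>(0);
  Tensor* Y = ctx->Output(0, X->Shape());
  // Stream() == static_cast<cudaStream_t>(provider_->GetComputeStream())
  NegateImpl(Stream(), X->Data<float>(), Y->MutableData<float>(), X->Shape().Size());
  return Status::OK();
}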
@@ -27,11 +27,11 @@ __global__ void _Fill(
}

template <typename T>
void Fill(T* output, T value, int64_t count) {
void Fill(cudaStream_t stream, T* output, T value, int64_t count) {
int blocksPerGrid = static_cast<int>(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
CUDA_LONG N = static_cast<CUDA_LONG>(count);
_Fill<T, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(output, value, N);
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(output, value, N);
}
template <typename T>
class ConstantBufferImpl : public IConstantBuffer<T> {

@@ -43,7 +43,7 @@ class ConstantBufferImpl : public IConstantBuffer<T> {
cudaFree(buffer_);
}

virtual const T* GetBuffer(size_t count) {
virtual const T* GetBuffer(cudaStream_t stream, size_t count) {
if (count > count_) {
if (buffer_) {
cudaFree(buffer_);

@@ -52,7 +52,7 @@ class ConstantBufferImpl : public IConstantBuffer<T> {
CUDA_CALL_THROW(cudaMalloc(&buffer_, count * sizeof(T)));
count_ = count;

Fill(buffer_, val_, count);
Fill(stream, buffer_, val_, count);
}
return buffer_;
}

@@ -76,7 +76,7 @@ template std::unique_ptr<IConstantBuffer<nv_bfloat16>> CreateConstantOnes<nv_bfl
#endif

#define SPECIALIZED_FILL(T) \
template void Fill<T>(T * output, T value, int64_t count);
template void Fill<T>(cudaStream_t stream, T * output, T value, int64_t count);

SPECIALIZED_FILL(int8_t)
SPECIALIZED_FILL(int16_t)

@@ -65,30 +65,30 @@ __global__ void CopyVectorBFloat16(const nv_bfloat16* x, int incx, nv_bfloat16*

} // namespace

cublasStatus_t cublasTransposeHelper(cublasHandle_t, cublasOperation_t, cublasOperation_t, int m, int n, const half*, const half* A, int, const half*, const half*, int, half* C, int) {
cublasStatus_t cublasTransposeHelper(cudaStream_t stream, cublasHandle_t, cublasOperation_t, cublasOperation_t, int m, int n, const half*, const half* A, int, const half*, const half*, int, half* C, int) {
if (C != A) {
dim3 dimGrid((n + TRANS_TILE_DIM - 1) / TRANS_TILE_DIM, (m + TRANS_TILE_DIM - 1) / TRANS_TILE_DIM, 1);
dim3 dimBlock(TRANS_TILE_DIM, BLOCK_ROWS, 1);

transposeNoOverlap<<<dimGrid, dimBlock>>>(C, A, n, m);
transposeNoOverlap<<<dimGrid, dimBlock, 0, stream>>>(C, A, n, m);
} else {
return CUBLAS_STATUS_NOT_SUPPORTED;
}
return CUBLAS_STATUS_SUCCESS;
}

cublasStatus_t cublasCopyHelper(cublasHandle_t, int n, const half* x, int incx, half* y, int incy) {
cublasStatus_t cublasCopyHelper(cudaStream_t stream, cublasHandle_t, int n, const half* x, int incx, half* y, int incy) {
dim3 dimGrid((unsigned int)(n + COPY_BLOCK_DIM - 1) / COPY_BLOCK_DIM, 1, 1);
dim3 dimBlock(COPY_BLOCK_DIM, 1, 1);
CopyVectorHalf<<<dimGrid, dimBlock>>>(x, incx, y, incy, n);
CopyVectorHalf<<<dimGrid, dimBlock, 0, stream>>>(x, incx, y, incy, n);
return CUBLAS_STATUS_SUCCESS;
}

#if CUDA_VERSION >= 11000
cublasStatus_t cublasCopyHelper(cublasHandle_t, int n, const nv_bfloat16* x, int incx, nv_bfloat16* y, int incy) {
cublasStatus_t cublasCopyHelper(cudaStream_t stream, cublasHandle_t, int n, const nv_bfloat16* x, int incx, nv_bfloat16* y, int incy) {
dim3 dimGrid((unsigned int)(n + COPY_BLOCK_DIM - 1) / COPY_BLOCK_DIM, 1, 1);
dim3 dimBlock(COPY_BLOCK_DIM, 1, 1);
CopyVectorBFloat16<<<dimGrid, dimBlock>>>(x, incx, y, incy, n);
CopyVectorBFloat16<<<dimGrid, dimBlock, 0, stream>>>(x, incx, y, incy, n);
return CUBLAS_STATUS_SUCCESS;
}
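GetBuffer now takes the stream so the lazy Fill lands on the session's stream and is ordered before any consumer kernel there. The lazy-init-on-stream idiom as a standalone sketch, with illustrative names that are not from the commit:

#include <cuda_runtime.h>

__global__ void fill_ones(float* buf, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) buf[i] = 1.0f;
}

// A device buffer of ones, grown and refilled on demand; the fill is enqueued
// on the caller's stream so later kernels on that stream see it completed.
// Callers must ensure prior users of the old buffer have finished before Get
// grows it, since the old allocation is freed immediately.
class OnesBuffer {
 public:
  ~OnesBuffer() { if (buf_) cudaFree(buf_); }
  const float* Get(cudaStream_t stream, int count) {
    if (count > count_) {
      if (buf_) cudaFree(buf_);
      cudaMalloc(&buf_, count * sizeof(float));
      count_ = count;
      int threads = 256;
      fill_ones<<<(count + threads - 1) / threads, threads, 0, stream>>>(buf_, count);
    }
    return buf_;
  }
 private:
  float* buf_ = nullptr;
  int count_ = 0;
};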
@@ -32,7 +32,7 @@ Status ConstantOfShape::ComputeInternal(OpKernelContext* ctx) const {
#define CASE(TYPE) \
case sizeof(TYPE): \
if (size > 0) { \
cuda::Fill(reinterpret_cast<TYPE*>(output_data), *(reinterpret_cast<const TYPE*>(value_ptr)), size); \
cuda::Fill(Stream(), reinterpret_cast<TYPE*>(output_data), *(reinterpret_cast<const TYPE*>(value_ptr)), size); \
} \
break;

@@ -30,7 +30,7 @@ ONNX_OPERATOR_KERNEL_EX(
Range);

template <typename T>
static Status ComputeRange(OpKernelContext* ctx) {
static Status ComputeRange(cudaStream_t stream, OpKernelContext* ctx) {
const auto& start_tensor = *ctx->Input<Tensor>(0);
const auto& limit_tensor = *ctx->Input<Tensor>(1);
const auto* delta_tensor_ptr = ctx->Input<Tensor>(2);

@@ -71,7 +71,7 @@ static Status ComputeRange(OpKernelContext* ctx) {
T* y = ctx->Output(0, shape)->template MutableData<T>();

if (count > 0) {
if (!RangeImpl(start, delta, count, y)) {
if (!RangeImpl(stream, start, delta, count, y)) {
CUDA_CALL(cudaGetLastError());
return Status(common::ONNXRUNTIME, common::FAIL);
}

@@ -84,8 +84,8 @@ namespace cuda_range_internal {

template <class T>
struct CallCudaRangeImpl {
Status operator()(OpKernelContext* ctx) const {
return ComputeRange<T>(ctx);
Status operator()(cudaStream_t stream, OpKernelContext* ctx) const {
return ComputeRange<T>(stream, ctx);
}
};

@@ -100,7 +100,7 @@ Status Range::ComputeInternal(OpKernelContext* ctx) const {
utils::MLTypeCallDispatcherRet<Status, cuda_range_internal::CallCudaRangeImpl, int32_t,
float, int64_t, double, int16_t>
t_disp(input_tensor->GetElementType());
return t_disp.Invoke(ctx);
return t_disp.Invoke(Stream(), ctx);
}

} // namespace cuda

@@ -22,15 +22,15 @@ __global__ void RangeKernel(const T start, const T delta, const int count, T* ou
}

template <typename T>
bool RangeImpl(const T start, const T delta, const int count, T* output) {
bool RangeImpl(cudaStream_t stream, const T start, const T delta, const int count, T* output) {
constexpr int block_size = 256;
int grid_size = (count + block_size - 1) / block_size;
RangeKernel<T><<<grid_size, block_size, 0>>>(start, delta, count, output);
RangeKernel<T><<<grid_size, block_size, 0, stream>>>(start, delta, count, output);
return CUDA_CALL(cudaPeekAtLastError());
}

#define SPECIALIZED_IMPL(T) \
template bool RangeImpl<T>(const T start, const T delta, const int count, T* output);
template bool RangeImpl<T>(cudaStream_t stream, const T start, const T delta, const int count, T* output);

SPECIALIZED_IMPL(int16_t)
SPECIALIZED_IMPL(int32_t)

@@ -9,7 +9,7 @@ namespace cuda {

template <typename T>
bool RangeImpl(const T start, const T delta, const int count, T* output);
bool RangeImpl(cudaStream_t stream, const T start, const T delta, const int count, T* output);

} // namespace cuda
} // namespace onnxruntime
@@ -9,12 +9,13 @@
// so we leave it as optional, in case user need the previous behavior
// a full fix to BFC arena is being looked at, and once it's in, we can revert this change
namespace onnxruntime {
GPUDataTransfer::GPUDataTransfer(bool do_copy_in_default_stream) {
GPUDataTransfer::GPUDataTransfer(cudaStream_t stream, bool do_copy_in_default_stream) {
// create streams, default is nullptr
streams_[kCudaStreamDefault] = nullptr;
do_copy_in_default_stream_ = do_copy_in_default_stream;
streams_[kCudaStreamDefault] = stream;
if (do_copy_in_default_stream) {
streams_[kCudaStreamCopyIn] = nullptr;
streams_[kCudaStreamCopyOut] = nullptr;
streams_[kCudaStreamCopyIn] = stream;
streams_[kCudaStreamCopyOut] = stream;
} else {
CUDA_CALL_THROW(cudaStreamCreateWithFlags(&streams_[kCudaStreamCopyIn], cudaStreamNonBlocking));
CUDA_CALL_THROW(cudaStreamCreateWithFlags(&streams_[kCudaStreamCopyOut], cudaStreamNonBlocking));

@@ -22,10 +23,10 @@ GPUDataTransfer::GPUDataTransfer(bool do_copy_in_default_stream) {
}

GPUDataTransfer::~GPUDataTransfer() {
if (streams_[kCudaStreamCopyIn] != nullptr) {
if (!do_copy_in_default_stream_ && streams_[kCudaStreamCopyIn] != nullptr) {
CUDA_CALL(cudaStreamDestroy(streams_[kCudaStreamCopyIn]));
}
if (streams_[kCudaStreamCopyOut] != nullptr) {
if (!do_copy_in_default_stream_ && streams_[kCudaStreamCopyOut] != nullptr) {
CUDA_CALL(cudaStreamDestroy(streams_[kCudaStreamCopyOut]));
}
}

@@ -46,24 +47,26 @@ common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int e
if (dst_device.Type() == OrtDevice::GPU) {
if (src_device.Type() == OrtDevice::CPU && src_device.MemType() == OrtDevice::MemType::CUDA_PINNED) {
// copy from pinned memory to GPU, this is non-blocking
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyHostToDevice, streams_[exec_queue_id]));
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyHostToDevice, GetStream(exec_queue_id)));
} else if (src_device.Type() == OrtDevice::GPU) {
// copying between GPU, this is non-blocking
// Copy only if the two addresses are different.
if (dst_data != src_data) {
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToDevice, streams_[kCudaStreamDefault]));
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToDevice, GetStream(kCudaStreamDefault)));
}
} else {
// copy from other CPU memory to GPU, this is blocking
CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyHostToDevice));
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyHostToDevice, GetStream(kCudaStreamDefault)));
CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(GetStream(kCudaStreamDefault)));
}
} else if (src_device.Type() == OrtDevice::GPU) {
if (dst_device.Type() == OrtDevice::CPU && dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED) {
// copying from GPU to pinned memory, this is non-blocking
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToHost, streams_[exec_queue_id]));
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToHost, GetStream(exec_queue_id)));
} else {
// copying from GPU to CPU memory, this is blocking
CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyDeviceToHost));
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToHost, GetStream(kCudaStreamDefault)));
CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(GetStream(kCudaStreamDefault)));
}
} else {
// copying between cpu memory

@@ -17,7 +17,7 @@ enum CUDAStreamType : int {

class GPUDataTransfer : public IDataTransfer {
public:
GPUDataTransfer(bool do_copy_in_default_stream = true);
GPUDataTransfer(cudaStream_t stream, bool do_copy_in_default_stream = true);
~GPUDataTransfer();

bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override;

@@ -32,6 +32,7 @@ class GPUDataTransfer : public IDataTransfer {
}

private:
bool do_copy_in_default_stream_;
cudaStream_t streams_[kTotalCudaStreams];
};
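For pageable host memory the copy must still block the host, so the blocking cudaMemcpy becomes cudaMemcpyAsync on the chosen stream followed by a stream sync: the caller-visible contract is unchanged, but all traffic now flows through explicitly selected streams. The two cases in isolation (standalone sketch, names illustrative):

#include <cuda_runtime.h>
#include <cstddef>

// Pageable source: the host must wait, so sync the stream after the copy.
cudaError_t CopyHostToDeviceBlocking(void* dst, const void* src, size_t bytes,
                                     cudaStream_t stream) {
  cudaError_t err = cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, stream);
  if (err != cudaSuccess) return err;
  return cudaStreamSynchronize(stream);
}

// Pinned source (cudaMallocHost/cudaHostRegister): truly asynchronous, no sync.
cudaError_t CopyPinnedToDeviceAsync(void* dst, const void* pinned_src, size_t bytes,
                                    cudaStream_t stream) {
  return cudaMemcpyAsync(dst, pinned_src, bytes, cudaMemcpyHostToDevice, stream);
}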
@@ -145,6 +145,7 @@ Status BinaryElementwise<ShouldBroadcast>::Prepare(OpKernelContext* context, Bin
BinaryElementwisePreparation prepare; \
ORT_RETURN_IF_ERROR(Prepare(context, &prepare)); \
Impl_##x<typename ToCudaType<T>::MappedType>( \
Stream(), \
prepare.output_rank_or_simple_broadcast, \
&prepare.lhs_padded_strides, \
reinterpret_cast<const typename ToCudaType<T>::MappedType*>(prepare.lhs_tensor->template Data<T>()), \

@@ -315,12 +316,13 @@ ONNX_OPERATOR_KERNEL_EX(

namespace pow12_internal {
template <class T>
Status DispatchOnFirstArg(const BinaryElementwisePreparation& prepare) {
Status DispatchOnFirstArg(cudaStream_t stream, const BinaryElementwisePreparation& prepare) {
namespace on = ONNX_NAMESPACE;
Status s;
switch (prepare.rhs_tensor->GetElementType()) {
case on::TensorProto_DataType_INT32:
ImplT1_Pow<typename ToCudaType<T>::MappedType, typename ToCudaType<int32_t>::MappedType>(
stream,
prepare.output_rank_or_simple_broadcast,
&prepare.lhs_padded_strides,
reinterpret_cast<const typename ToCudaType<T>::MappedType*>(prepare.lhs_tensor->template Data<T>()),

@@ -334,6 +336,7 @@ Status DispatchOnFirstArg(const BinaryElementwisePreparation& prepare) {
break;
case on::TensorProto_DataType_INT64:
ImplT1_Pow<typename ToCudaType<T>::MappedType, typename ToCudaType<int64_t>::MappedType>(
stream,
prepare.output_rank_or_simple_broadcast,
&prepare.lhs_padded_strides,
reinterpret_cast<const typename ToCudaType<T>::MappedType*>(prepare.lhs_tensor->template Data<T>()),

@@ -347,6 +350,7 @@ Status DispatchOnFirstArg(const BinaryElementwisePreparation& prepare) {
break;
case on::TensorProto_DataType_FLOAT:
ImplT1_Pow<typename ToCudaType<T>::MappedType, typename ToCudaType<float>::MappedType>(
stream,
prepare.output_rank_or_simple_broadcast,
&prepare.lhs_padded_strides,
reinterpret_cast<const typename ToCudaType<T>::MappedType*>(prepare.lhs_tensor->template Data<T>()),

@@ -360,6 +364,7 @@ Status DispatchOnFirstArg(const BinaryElementwisePreparation& prepare) {
break;
case on::TensorProto_DataType_DOUBLE:
ImplT1_Pow<typename ToCudaType<T>::MappedType, typename ToCudaType<double>::MappedType>(
stream,
prepare.output_rank_or_simple_broadcast,
&prepare.lhs_padded_strides,
reinterpret_cast<const typename ToCudaType<T>::MappedType*>(prepare.lhs_tensor->template Data<T>()),

@@ -373,6 +378,7 @@ Status DispatchOnFirstArg(const BinaryElementwisePreparation& prepare) {
break;
case on::TensorProto_DataType_FLOAT16:
ImplT1_Pow<typename ToCudaType<T>::MappedType, typename ToCudaType<MLFloat16>::MappedType>(
stream,
prepare.output_rank_or_simple_broadcast,
&prepare.lhs_padded_strides,
reinterpret_cast<const typename ToCudaType<T>::MappedType*>(prepare.lhs_tensor->template Data<T>()),

@@ -402,19 +408,19 @@ Status Pow::ComputeInternal(OpKernelContext* context) const {

switch (prepare.lhs_tensor->GetElementType()) {
case on::TensorProto_DataType_INT32:
s = DispatchOnFirstArg<int32_t>(prepare);
s = DispatchOnFirstArg<int32_t>(Stream(), prepare);
break;
case on::TensorProto_DataType_INT64:
s = DispatchOnFirstArg<int64_t>(prepare);
s = DispatchOnFirstArg<int64_t>(Stream(), prepare);
break;
case on::TensorProto_DataType_FLOAT:
s = DispatchOnFirstArg<float>(prepare);
s = DispatchOnFirstArg<float>(Stream(), prepare);
break;
case on::TensorProto_DataType_DOUBLE:
s = DispatchOnFirstArg<double>(prepare);
s = DispatchOnFirstArg<double>(Stream(), prepare);
break;
case on::TensorProto_DataType_FLOAT16:
s = DispatchOnFirstArg<MLFloat16>(prepare);
s = DispatchOnFirstArg<MLFloat16>(Stream(), prepare);
break;
default:
s = ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported X type: ",

@@ -431,6 +437,7 @@ Status CompareFunction<T, CudaT>::CompareMethod(OpKernelContext* context, ImplCo
ORT_RETURN_IF_ERROR(Prepare(context, &prepare));

Impl_Compare(
Stream(),
prepare.output_rank_or_simple_broadcast,
&prepare.lhs_padded_strides,
reinterpret_cast<const CudaT*>(prepare.lhs_tensor->template Data<T>()),

@@ -219,7 +219,8 @@ class CompareFunction : public BinaryElementwise<ShouldBroadcast> {
public:
CompareFunction(const OpKernelInfo& info) : BinaryElementwise(info) {}

typedef void (*ImplCompare)(int32_t output_rank_or_simple_broadcast,
typedef void (*ImplCompare)(cudaStream_t stream,
int32_t output_rank_or_simple_broadcast,
const TArray<int64_t>* lhs_padded_strides,
const CudaT* lhs_data,
const TArray<int64_t>* rhs_padded_strides,

@@ -12,7 +12,8 @@ namespace cuda {

#define BINARY_ELEMENTWISE_IMPL(name) \
BINARY_ELEMENTWISE_IMPL_DECLARATION(name) { \
BinaryElementWiseImpl(output_rank_or_simple_broadcast, \
BinaryElementWiseImpl(stream, \
output_rank_or_simple_broadcast, \
lhs_padded_strides, \
lhs_data, \
rhs_padded_strides, \

@@ -27,7 +28,8 @@ namespace cuda {

#define BINARY_ELEMENTWISE_IMPL_T1(name) \
BINARY_ELEMENTWISE_IMPL_DECLARATION_T1(name) { \
BinaryElementWiseImpl(output_rank_or_simple_broadcast, \
BinaryElementWiseImpl(stream, \
output_rank_or_simple_broadcast, \
lhs_padded_strides, \
lhs_data, \
rhs_padded_strides, \

@@ -42,7 +44,8 @@ namespace cuda {

#define BINARY_ELEMENTWISE_IMPL_T2(name) \
BINARY_ELEMENTWISE_IMPL_DECLARATION_T2(name) { \
BinaryElementWiseImpl(output_rank_or_simple_broadcast, \
BinaryElementWiseImpl(stream, \
output_rank_or_simple_broadcast, \
lhs_padded_strides, \
lhs_data, \
rhs_padded_strides, \

@@ -56,19 +59,22 @@ namespace cuda {
}

#define SPECIALIZED_BINARY_ELEMENTWISE_IMPL(x, T) \
template void Impl_##x<T>(int32_t output_rank, \
template void Impl_##x<T>(cudaStream_t stream, \
int32_t output_rank, \
const TArray<int64_t>* lhs_padded_strides, const T* lhs_data, \
const TArray<int64_t>* rhs_padded_strides, const T* rhs_data, \
const TArray<fast_divmod>* fdm_output_strides, const fast_divmod& fdm_H, const fast_divmod& fdm_C, T* output_data, size_t count);

#define SPECIALIZED_BINARY_ELEMENTWISE_IMPL_T1(x, T, T1) \
template void ImplT1_##x<T, T1>(int32_t output_rank, \
template void ImplT1_##x<T, T1>(cudaStream_t stream, \
int32_t output_rank, \
const TArray<int64_t>* lhs_padded_strides, const T* lhs_data, \
const TArray<int64_t>* rhs_padded_strides, const T1* rhs_data, \
const TArray<fast_divmod>* fdm_output_strides, const fast_divmod& fdm_H, const fast_divmod& fdm_C, T* output_data, size_t count);

#define SPECIALIZED_BINARY_ELEMENTWISE_IMPL_T2(x, T, T1, T2) \
template void ImplT2_##x<T, T1, T2>(int32_t output_rank, \
template void ImplT2_##x<T, T1, T2>(cudaStream_t stream, \
int32_t output_rank, \
const TArray<int64_t>* lhs_padded_strides, const T1* lhs_data, \
const TArray<int64_t>* rhs_padded_strides, const T2* rhs_data, \
const TArray<fast_divmod>* fdm_output_strides, const fast_divmod& fdm_H, const fast_divmod& fdm_C, T* output_data, size_t count);

@@ -34,6 +34,7 @@ namespace cuda {
#define BINARY_ELEMENTWISE_IMPL_DECLARATION(name) \
template <typename T> \
void Impl_##name( \
cudaStream_t stream, \
int32_t output_rank_or_simple_broadcast, \
const TArray<int64_t>* lhs_padded_strides, \
const T* lhs_data, \

@@ -52,6 +53,7 @@ BINARY_OPS()
#define BINARY_ELEMENTWISE_IMPL_DECLARATION_T1(name) \
template <typename T, typename T1> \
void ImplT1_##name( \
cudaStream_t stream, \
int32_t output_rank_or_simple_broadcast, \
const TArray<int64_t>* lhs_padded_strides, \
const T* lhs_data, \

@@ -68,6 +70,7 @@ BINARY_ELEMENTWISE_IMPL_DECLARATION_T1(Pow);
#define BINARY_ELEMENTWISE_IMPL_DECLARATION_T2(name) \
template <typename T, typename T1, typename T2> \
void ImplT2_##name( \
cudaStream_t stream, \
int32_t output_rank_or_simple_broadcast, \
const TArray<int64_t>* lhs_padded_strides, \
const T1* lhs_data, \
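For readers tracing the macro layers: after this change, BINARY_ELEMENTWISE_IMPL(Add) expands to roughly the following. This is a hand-expanded sketch; the OP_Add functor name and the exact trailing-parameter order are assumed from the declarations above, not copied from the source:

// Hand-expanded sketch of one macro instantiation with the stream threaded through.
template <typename T>
void Impl_Add(cudaStream_t stream,
              int32_t output_rank_or_simple_broadcast,
              const TArray<int64_t>* lhs_padded_strides, const T* lhs_data,
              const TArray<int64_t>* rhs_padded_strides, const T* rhs_data,
              const TArray<fast_divmod>* fdm_output_strides,
              const fast_divmod& fdm_H, const fast_divmod& fdm_C,
              T* output_data, size_t count) {
  BinaryElementWiseImpl(stream, output_rank_or_simple_broadcast,
                        lhs_padded_strides, lhs_data,
                        rhs_padded_strides, rhs_data,
                        fdm_output_strides, fdm_H, fdm_C,
                        output_data, OP_Add<T>(), count);  // functor name assumed
}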
@ -62,7 +62,7 @@ Status Clip_6<T>::ComputeInternal(OpKernelContext* ctx) const {
|
|||
if (count > 0) {
|
||||
auto* y_data = Y->template MutableData<T>();
|
||||
const auto* x_data = X.template Data<T>();
|
||||
ClipImpl<T>(x_data, y_data, this->min_, this->max_, count);
|
||||
ClipImpl<T>(Stream(), x_data, y_data, this->min_, this->max_, count);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
|
@ -91,7 +91,7 @@ struct LowMax<MLFloat16> {
|
|||
|
||||
template <typename T>
|
||||
struct Clip::ComputeImpl {
|
||||
void operator()(const Tensor* X, const Tensor* min, const Tensor* max, Tensor* Y) const {
|
||||
void operator()(cudaStream_t stream, const Tensor* X, const Tensor* min, const Tensor* max, Tensor* Y) const {
|
||||
auto min_val = clip_internal::LowMax<T>::low();
|
||||
auto max_val = clip_internal::LowMax<T>::max();
|
||||
|
||||
|
|
@ -110,7 +110,7 @@ struct Clip::ComputeImpl {
|
|||
if (count > 0) {
|
||||
auto* y_data = Y->template MutableData<T>();
|
||||
const auto* x_data = X->template Data<T>();
|
||||
ClipImpl<T>(x_data, y_data, min_val, max_val, count);
|
||||
ClipImpl<T>(stream, x_data, y_data, min_val, max_val, count);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
|
@ -124,7 +124,7 @@ Status Clip::ComputeInternal(OpKernelContext* ctx) const {
|
|||
utils::MLTypeCallDispatcher<ComputeImpl, float, double, int8_t, uint8_t, int64_t, uint64_t>
|
||||
t_disp(X->GetElementType());
|
||||
|
||||
t_disp.Invoke(X, min, max, Y);
|
||||
t_disp.Invoke(Stream(), X, min, max, Y);
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -13,24 +13,24 @@ __global__ void _Clip(const T* input, T* output, T min, T max, size_t N) {
 }

 template <typename T>
-void ClipImpl(const T* input_data, T* output_data, T min, T max, size_t count) {
+void ClipImpl(cudaStream_t stream, const T* input_data, T* output_data, T min, T max, size_t count) {
   typedef typename ToCudaType<T>::MappedType CudaT;

   int blocksPerGrid = (int)(ceil(static_cast<float>(count) / GridDim::maxThreadsPerBlock));
-  _Clip<CudaT><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(reinterpret_cast<const CudaT*>(input_data),
+  _Clip<CudaT><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(reinterpret_cast<const CudaT*>(input_data),
                                                                   reinterpret_cast<CudaT*>(output_data),
                                                                   *reinterpret_cast<CudaT*>(&min),
                                                                   *reinterpret_cast<CudaT*>(&max),
                                                                   count);
 }

-template void ClipImpl<float>(const float* input_data, float* output_data, float min, float max, size_t count);
-template void ClipImpl<double>(const double* input_data, double* output_data, double min, double max, size_t count);
-template void ClipImpl<MLFloat16>(const MLFloat16* input_data, MLFloat16* output_data, MLFloat16 min, MLFloat16 max, size_t count);
-template void ClipImpl<int8_t>(const int8_t* input_data, int8_t* output_data, int8_t min, int8_t max, size_t count);
-template void ClipImpl<uint8_t>(const uint8_t* input_data, uint8_t* output_data, uint8_t min, uint8_t max, size_t count);
-template void ClipImpl<int64_t>(const int64_t* input_data, int64_t* output_data, int64_t min, int64_t max, size_t count);
-template void ClipImpl<uint64_t>(const uint64_t* input_data, uint64_t* output_data, uint64_t min, uint64_t max, size_t count);
+template void ClipImpl<float>(cudaStream_t stream, const float* input_data, float* output_data, float min, float max, size_t count);
+template void ClipImpl<double>(cudaStream_t stream, const double* input_data, double* output_data, double min, double max, size_t count);
+template void ClipImpl<MLFloat16>(cudaStream_t stream, const MLFloat16* input_data, MLFloat16* output_data, MLFloat16 min, MLFloat16 max, size_t count);
+template void ClipImpl<int8_t>(cudaStream_t stream, const int8_t* input_data, int8_t* output_data, int8_t min, int8_t max, size_t count);
+template void ClipImpl<uint8_t>(cudaStream_t stream, const uint8_t* input_data, uint8_t* output_data, uint8_t min, uint8_t max, size_t count);
+template void ClipImpl<int64_t>(cudaStream_t stream, const int64_t* input_data, int64_t* output_data, int64_t min, int64_t max, size_t count);
+template void ClipImpl<uint64_t>(cudaStream_t stream, const uint64_t* input_data, uint64_t* output_data, uint64_t min, uint64_t max, size_t count);

 } // namespace cuda
 } // namespace onnxruntime
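This is the mechanical pattern repeated throughout the diff: the stream becomes the fourth launch-configuration argument (<<<grid, block, sharedMem, stream>>>), so the kernel is ordered on the session's compute stream instead of the default stream. A self-contained illustration of the shape of the change; all names here are illustrative, not ORT code:

    // Minimal sketch of a stream-aware elementwise launch.
    #include <cuda_runtime.h>

    __global__ void scale(const float* in, float* out, float k, size_t n) {
      size_t i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) out[i] = in[i] * k;
    }

    void ScaleImpl(cudaStream_t stream, const float* in, float* out, float k, size_t n) {
      const int block = 256;
      const int grid = static_cast<int>((n + block - 1) / block);
      // Omitting the explicit stream argument would enqueue on the default
      // stream; passing it keeps the work on the caller's compute stream.
      scale<<<grid, block, 0, stream>>>(in, out, k, n);
    }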
@@ -10,7 +10,7 @@
 namespace onnxruntime {
 namespace cuda {
 template <typename T>
-void ClipImpl(const T* input_data, T* output_data, T min, T max, size_t count);
+void ClipImpl(cudaStream_t stream, const T* input_data, T* output_data, T min, T max, size_t count);

 } // namespace cuda
 } // namespace onnxruntime
@@ -77,7 +77,7 @@ Status CumSum::ComputeInternal(OpKernelContext* ctx) const {
   fast_divmod fast_divmod_input_stride_along_axis(static_cast<int>(input_stride_along_axis));

   if (input->IsDataType<float>()) {
-    CumSumImpl(reinterpret_cast<const typename ToCudaType<float>::MappedType*>(input->Data<float>()),
+    CumSumImpl(Stream(), reinterpret_cast<const typename ToCudaType<float>::MappedType*>(input->Data<float>()),
                fast_divmod_input_dim_along_axis,
                fast_divmod_input_stride_along_axis,
                reinterpret_cast<typename ToCudaType<float>::MappedType*>(output.MutableData<float>()),
@@ -85,7 +85,7 @@ Status CumSum::ComputeInternal(OpKernelContext* ctx) const {
                exclusive_,
                reverse_);
   } else if (input->IsDataType<double>()) {
-    CumSumImpl(reinterpret_cast<const typename ToCudaType<double>::MappedType*>(input->Data<double>()),
+    CumSumImpl(Stream(), reinterpret_cast<const typename ToCudaType<double>::MappedType*>(input->Data<double>()),
                fast_divmod_input_dim_along_axis,
                fast_divmod_input_stride_along_axis,
                reinterpret_cast<typename ToCudaType<double>::MappedType*>(output.MutableData<double>()),
@@ -93,7 +93,7 @@ Status CumSum::ComputeInternal(OpKernelContext* ctx) const {
                exclusive_,
                reverse_);
   } else if (input->IsDataType<int32_t>()) {
-    CumSumImpl(reinterpret_cast<const typename ToCudaType<int32_t>::MappedType*>(input->Data<int32_t>()),
+    CumSumImpl(Stream(), reinterpret_cast<const typename ToCudaType<int32_t>::MappedType*>(input->Data<int32_t>()),
                fast_divmod_input_dim_along_axis,
                fast_divmod_input_stride_along_axis,
                reinterpret_cast<typename ToCudaType<int32_t>::MappedType*>(output.MutableData<int32_t>()),
@@ -101,7 +101,7 @@ Status CumSum::ComputeInternal(OpKernelContext* ctx) const {
                exclusive_,
                reverse_);
   } else if (input->IsDataType<int64_t>()) {
-    CumSumImpl(reinterpret_cast<const typename ToCudaType<int64_t>::MappedType*>(input->Data<int64_t>()),
+    CumSumImpl(Stream(), reinterpret_cast<const typename ToCudaType<int64_t>::MappedType*>(input->Data<int64_t>()),
                fast_divmod_input_dim_along_axis,
                fast_divmod_input_stride_along_axis,
                reinterpret_cast<typename ToCudaType<int64_t>::MappedType*>(output.MutableData<int64_t>()),
@@ -109,7 +109,7 @@ Status CumSum::ComputeInternal(OpKernelContext* ctx) const {
                exclusive_,
                reverse_);
   } else if (input->IsDataType<uint32_t>()) {
-    CumSumImpl(reinterpret_cast<const typename ToCudaType<uint32_t>::MappedType*>(input->Data<uint32_t>()),
+    CumSumImpl(Stream(), reinterpret_cast<const typename ToCudaType<uint32_t>::MappedType*>(input->Data<uint32_t>()),
                fast_divmod_input_dim_along_axis,
                fast_divmod_input_stride_along_axis,
                reinterpret_cast<typename ToCudaType<uint32_t>::MappedType*>(output.MutableData<uint32_t>()),
@@ -117,7 +117,7 @@ Status CumSum::ComputeInternal(OpKernelContext* ctx) const {
                exclusive_,
                reverse_);
   } else if (input->IsDataType<uint64_t>()) {
-    CumSumImpl(reinterpret_cast<const typename ToCudaType<uint64_t>::MappedType*>(input->Data<uint64_t>()),
+    CumSumImpl(Stream(), reinterpret_cast<const typename ToCudaType<uint64_t>::MappedType*>(input->Data<uint64_t>()),
                fast_divmod_input_dim_along_axis,
                fast_divmod_input_stride_along_axis,
                reinterpret_cast<typename ToCudaType<uint64_t>::MappedType*>(output.MutableData<uint64_t>()),
@@ -125,7 +125,7 @@ Status CumSum::ComputeInternal(OpKernelContext* ctx) const {
                exclusive_,
                reverse_);
   } else if (input->IsDataType<MLFloat16>()) {
-    CumSumImpl(reinterpret_cast<const typename ToCudaType<MLFloat16>::MappedType*>(input->Data<MLFloat16>()),
+    CumSumImpl(Stream(), reinterpret_cast<const typename ToCudaType<MLFloat16>::MappedType*>(input->Data<MLFloat16>()),
                fast_divmod_input_dim_along_axis,
                fast_divmod_input_stride_along_axis,
                reinterpret_cast<typename ToCudaType<MLFloat16>::MappedType*>(output.MutableData<MLFloat16>()),
@@ -71,6 +71,7 @@ __global__ void _CumSumKernel(

 template <typename T>
 void CumSumImpl(
+    cudaStream_t stream,
     const T* input_data,
     const fast_divmod& input_dim_along_axis,
     const fast_divmod& input_stride_along_axis,
@@ -81,7 +82,7 @@ void CumSumImpl(
   if (output_size > 0) {
     int blocksPerGrid = static_cast<int>((output_size + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock);

-    _CumSumKernel<T><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(input_data,
+    _CumSumKernel<T><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(input_data,
                                                                        input_dim_along_axis,
                                                                        input_stride_along_axis,
                                                                        output_data,
@@ -92,6 +93,7 @@ void CumSumImpl(
 }

 template void CumSumImpl<int32_t>(
+    cudaStream_t stream,
     const int32_t* input_data,
     const fast_divmod& input_dim_along_axis,
     const fast_divmod& input_stride_along_axis,
@@ -101,6 +103,7 @@ template void CumSumImpl<int32_t>(
     bool reverse);

 template void CumSumImpl<int64_t>(
+    cudaStream_t stream,
     const int64_t* input_data,
     const fast_divmod& input_dim_along_axis,
     const fast_divmod& input_stride_along_axis,
@@ -110,6 +113,7 @@ template void CumSumImpl<int64_t>(
     bool reverse);

 template void CumSumImpl<uint32_t>(
+    cudaStream_t stream,
     const uint32_t* input_data,
     const fast_divmod& input_dim_along_axis,
     const fast_divmod& input_stride_along_axis,
@@ -119,6 +123,7 @@ template void CumSumImpl<uint32_t>(
     bool reverse);

 template void CumSumImpl<uint64_t>(
+    cudaStream_t stream,
     const uint64_t* input_data,
     const fast_divmod& input_dim_along_axis,
     const fast_divmod& input_stride_along_axis,
@@ -128,6 +133,7 @@ template void CumSumImpl<uint64_t>(
     bool reverse);

 template void CumSumImpl<float>(
+    cudaStream_t stream,
     const float* input_data,
     const fast_divmod& input_dim_along_axis,
     const fast_divmod& input_stride_along_axis,
@@ -137,6 +143,7 @@ template void CumSumImpl<float>(
     bool reverse);

 template void CumSumImpl<double>(
+    cudaStream_t stream,
     const double* input_data,
     const fast_divmod& input_dim_along_axis,
     const fast_divmod& input_stride_along_axis,
@@ -146,6 +153,7 @@ template void CumSumImpl<double>(
     bool reverse);

 template void CumSumImpl<half>(
+    cudaStream_t stream,
     const half* input_data,
     const fast_divmod& input_dim_along_axis,
     const fast_divmod& input_stride_along_axis,
@@ -11,6 +11,7 @@ namespace cuda {

 template <typename T>
 void CumSumImpl(
+    cudaStream_t stream,
     const T* input_data,
     const fast_divmod& input_dim_along_axis,
     const fast_divmod& input_stride_along_axis,
@@ -12,14 +12,15 @@ namespace DeviceHelpers {
 namespace CudaDeviceHelpers {

 // CUDA EP specific Data copy helper
-Status DataCopy(const Tensor& input, Tensor& output) {
+Status DataCopy(const Tensor& input, Tensor& output, void* einsum_cuda_assets) {
   ORT_ENFORCE(output.SizeInBytes() == input.SizeInBytes(),
               "Einsum op: The candidate output does not match the actual output's shape");
   // There are no string tensors in Einsum's case - so safely use memcpy
   // TODO: Currently, triggers copy on stream 0, investigate if we can still do that
   // *if* the kernel is launched in a different stream
   CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output.MutableDataRaw(), input.DataRaw(), input.SizeInBytes(),
-                                       cudaMemcpyDeviceToDevice));
+                                       cudaMemcpyDeviceToDevice,
+                                       static_cast<cudaStream_t>(static_cast<EinsumCudaAssets*>(einsum_cuda_assets)->cuda_ep_->GetComputeStream())));

   return Status::OK();
 }
@@ -28,6 +29,7 @@ Status DataCopy(const Tensor& input, Tensor& output) {
 Status Transpose(const std::vector<size_t>& permutation, const Tensor& input,
                  Tensor& output, const TensorShape* input_shape_override, void* einsum_cuda_assets) {
   return cuda::Transpose::DoTranspose(static_cast<EinsumCudaAssets*>(einsum_cuda_assets)->cuda_ep_->GetDeviceProp(),
+                                      static_cast<cudaStream_t>(static_cast<EinsumCudaAssets*>(einsum_cuda_assets)->cuda_ep_->GetComputeStream()),
                                       static_cast<EinsumCudaAssets*>(einsum_cuda_assets)->cublas_handle_,
                                       permutation, input, output, input_shape_override);
 }
@@ -79,7 +81,7 @@ Tensor ReduceSum(const Tensor& input, const std::vector<int64_t>& reduce_axes,
 }

 // CUDA EP specific Diagonal helper
-std::unique_ptr<Tensor> Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator) {
+std::unique_ptr<Tensor> Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator, void* einsum_cuda_assets) {
   const auto& input_shape = input.Shape();
   const auto& input_dims = input_shape.GetDims();
   auto rank = static_cast<int64_t>(input_dims.size());
@@ -117,6 +119,7 @@ std::unique_ptr<Tensor> Diagonal(const Tensor& input, int64_t dim_1, int64_t dim
   }

   DiagonalImpl(
+      static_cast<cudaStream_t>(static_cast<EinsumCudaAssets*>(einsum_cuda_assets)->cuda_ep_->GetComputeStream()),
       input.DataRaw(),
       input.Shape().GetDims().size(),
       first_dim,
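The DataCopy change addresses the stream-0 TODO above: the device-to-device copy is now enqueued on the EP's compute stream rather than the default stream. A hedged, self-contained sketch of the call shape; the wrapper and its names are illustrative placeholders:

    // cudaMemcpyAsync with an explicit stream: the copy is merely enqueued,
    // runs after previously submitted work on `stream`, and needs a later
    // synchronization (or a stream-ordered consumer) to be observed.
    #include <cuda_runtime.h>

    cudaError_t CopyOnStream(void* dst, const void* src, size_t bytes, cudaStream_t stream) {
      return cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToDevice, stream);
    }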
@@ -38,7 +38,7 @@ namespace CudaDeviceHelpers {
 Status Transpose(const std::vector<size_t>& permutation, const Tensor& input,
                  Tensor& output, const TensorShape* input_shape_override, void* einsum_cuda_assets);

-Status DataCopy(const Tensor& input, Tensor& output);
+Status DataCopy(const Tensor& input, Tensor& output, void* einsum_cuda_assets);

 template <typename T>
 Status MatMul(const T* input_1_data, const T* input_2_data, T* output_data,
@@ -52,7 +52,7 @@ Tensor ReduceSum(const Tensor& input, const std::vector<int64_t>& reduce_axes,
                  const TensorShape* input_shape_override,
                  concurrency::ThreadPool* /*tp*/, void* einsum_cuda_assets);

-std::unique_ptr<Tensor> Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator);
+std::unique_ptr<Tensor> Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator, void* einsum_cuda_assets);

 } // namespace CudaDeviceHelpers
@@ -47,6 +47,7 @@ __global__ void _DiagonalKernel(
 }

 void DiagonalImpl(
+    cudaStream_t stream,
     const void* input_data,
     const int64_t input_rank,
     const int64_t dim_1,
@@ -61,14 +62,14 @@ void DiagonalImpl(

   switch (element_size) {
     case sizeof(int32_t):
-      _DiagonalKernel<int32_t><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
+      _DiagonalKernel<int32_t><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
           reinterpret_cast<const ToCudaType<int32_t>::MappedType*>(input_data), input_rank, dim_1, dim_2,
           input_strides, reinterpret_cast<ToCudaType<int32_t>::MappedType*>(output_data), output_strides,
           output_size);
       break;

     case sizeof(int64_t):
-      _DiagonalKernel<int64_t><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
+      _DiagonalKernel<int64_t><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
           reinterpret_cast<const ToCudaType<int64_t>::MappedType*>(input_data), input_rank, dim_1, dim_2,
           input_strides, reinterpret_cast<ToCudaType<int64_t>::MappedType*>(output_data), output_strides,
           output_size);
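Note that the switch above keys on element byte width rather than logical type, so one kernel instantiation serves every 4-byte type and another every 8-byte type. A hedged sketch of the same idea in isolation; the kernel body and names are illustrative only:

    // Dispatching a type-agnostic data-movement kernel on element width, so
    // float/int32_t share one instantiation and double/int64_t another.
    #include <cstddef>
    #include <cstdint>
    #include <cuda_runtime.h>

    template <typename UIntT>
    __global__ void copy_bits(const UIntT* in, UIntT* out, size_t n) {
      size_t i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) out[i] = in[i];  // placeholder for the real index arithmetic
    }

    void DispatchByWidth(cudaStream_t stream, const void* in, void* out,
                         size_t n, size_t element_size) {
      const int block = 256;
      const int grid = static_cast<int>((n + block - 1) / block);
      switch (element_size) {
        case sizeof(uint32_t):
          copy_bits<uint32_t><<<grid, block, 0, stream>>>(
              static_cast<const uint32_t*>(in), static_cast<uint32_t*>(out), n);
          break;
        case sizeof(uint64_t):
          copy_bits<uint64_t><<<grid, block, 0, stream>>>(
              static_cast<const uint64_t*>(in), static_cast<uint64_t*>(out), n);
          break;
      }
    }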
@@ -10,6 +10,7 @@ namespace onnxruntime {
 namespace cuda {

 void DiagonalImpl(
+    cudaStream_t stream,
     const void* input_data,
     const int64_t input_rank,
     const int64_t dim_1,
@@ -86,6 +86,7 @@ Status Gemm<T>::ComputeInternal(OpKernelContext* ctx) const {
   if (b_shape.Size() == 1) {
     // if B is (), (1,) or (1, 1), broadcast the scalar
     CUBLAS_RETURN_IF_ERROR(cublasCopyHelper(
+        Stream(),
         CublasHandle(),
         M * N,
         b_data,
@@ -118,7 +119,7 @@ Status Gemm<T>::ComputeInternal(OpKernelContext* ctx) const {
                   out_data, N, device_prop));
   } else {
     // B is (M, N), no broadcast needed.
-    CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(out_data, b_data, M * N * sizeof(T), cudaMemcpyDeviceToDevice));
+    CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(out_data, b_data, M * N * sizeof(T), cudaMemcpyDeviceToDevice, Stream()));
   }
 }
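cuBLAS work follows whatever stream is bound to the handle, so a helper that takes the stream first, like cublasCopyHelper above, presumably binds it before issuing the library call. A hedged sketch of that pattern; cublasSetStream and cublasScopy are real cuBLAS APIs, but the wrapper itself is illustrative, not the actual cublasCopyHelper:

    #include <cublas_v2.h>

    cublasStatus_t CopyHelperSketch(cudaStream_t stream, cublasHandle_t handle,
                                    int n, const float* x, float* y) {
      // Subsequent cuBLAS calls on this handle execute on `stream`.
      cublasStatus_t s = cublasSetStream(handle, stream);
      if (s != CUBLAS_STATUS_SUCCESS) return s;
      return cublasScopy(handle, n, x, 1, y, 1);
    }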
@@ -70,19 +70,20 @@ Status MatMulInteger<int8_t, int8_t>::ComputeInternal(OpKernelContext* ctx) cons
   IAllocatorUniquePtr<int32_t> a_row_buf;
   if (b_offset != 0) {
     a_row_buf = GetScratchBuffer<int32_t>(helper.OutputShape().Size() / helper.N());
-    ORT_RETURN_IF_ERROR(ReduceRowSumOnMatrixA(a_ptr, a_row_buf.get(), b_offset, helper));
+    ORT_RETURN_IF_ERROR(ReduceRowSumOnMatrixA(Stream(), a_ptr, a_row_buf.get(), b_offset, helper));
   }

   IAllocatorUniquePtr<int32_t> b_col_buf;
   if (a_offset != 0) {
     b_col_buf = GetScratchBuffer<int32_t>(helper.OutputShape().Size() / helper.M());
-    ORT_RETURN_IF_ERROR(ReduceColSumOnMatrixB(b_ptr, b_col_buf.get(), a_offset, helper));
+    ORT_RETURN_IF_ERROR(ReduceColSumOnMatrixB(Stream(), b_ptr, b_col_buf.get(), a_offset, helper));
   }

   int alpha = 1;
   int beta = 0;
   if (a_offset != 0 || b_offset != 0) {
-    OffsetOutput(a_row_buf.get(),
+    OffsetOutput(Stream(),
+                 a_row_buf.get(),
                  b_col_buf.get(),
                  output_ptr,
                  a_offset,
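For context on what these stream-threaded kernels compute: with zero points a_offset and b_offset, the int8 product is usually corrected via the standard expansion sum_j (A[i][j] - a_off) * (B[j][k] - b_off) = (A*B)[i][k] - a_off*colsum_B[k] - b_off*rowsum_A[i] + K*a_off*b_off. A scalar reference under that assumption; shapes and names are illustrative, not the ORT kernels themselves:

    #include <cstdint>
    #include <vector>

    // Applies the zero-point correction to a raw int32 GEMM accumulator.
    void OffsetOutputReference(std::vector<int32_t>& out, int M, int N, int K,
                               const std::vector<int32_t>& rowsum_a,  // size M
                               const std::vector<int32_t>& colsum_b,  // size N
                               int8_t a_off, int8_t b_off) {
      for (int i = 0; i < M; ++i)
        for (int k = 0; k < N; ++k)
          out[i * N + k] += -a_off * colsum_b[k] - b_off * rowsum_a[i]
                            + K * static_cast<int32_t>(a_off) * b_off;
    }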
@@ -26,9 +26,9 @@ __global__ void ReduceRowSumOnMatrixAKernel(const int8_t* matrix, int32_t* row_s
   }
 }

-Status ReduceRowSumOnMatrixA(const int8_t* matrix, int32_t* row_sum, const int8_t offset, const MatMulComputeHelper& helper) {
+Status ReduceRowSumOnMatrixA(cudaStream_t stream, const int8_t* matrix, int32_t* row_sum, const int8_t offset, const MatMulComputeHelper& helper) {
   for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
-    ReduceRowSumOnMatrixAKernel<static_cast<int>(GridDim::maxThreadsPerBlock)><<<static_cast<int>(helper.M()), GridDim::maxThreadsPerBlock, 0>>>(matrix + helper.LeftOffsets()[batch],
+    ReduceRowSumOnMatrixAKernel<static_cast<int>(GridDim::maxThreadsPerBlock)><<<static_cast<int>(helper.M()), GridDim::maxThreadsPerBlock, 0, stream>>>(matrix + helper.LeftOffsets()[batch],
                                                         row_sum + batch * helper.M(),
                                                         offset,
                                                         static_cast<int>(helper.K()));
@@ -54,9 +54,9 @@ __global__ void ReduceColSumOnMatrixBKernel(const int8_t* matrix, int32_t* col_s
   }
 }

-Status ReduceColSumOnMatrixB(const int8_t* matrix, int32_t* col_sum, const int8_t offset, const MatMulComputeHelper& helper) {
+Status ReduceColSumOnMatrixB(cudaStream_t stream, const int8_t* matrix, int32_t* col_sum, const int8_t offset, const MatMulComputeHelper& helper) {
   for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
-    ReduceColSumOnMatrixBKernel<static_cast<int>(GridDim::maxThreadsPerBlock)><<<static_cast<int>(helper.N()), GridDim::maxThreadsPerBlock, 0>>>(matrix + helper.RightOffsets()[batch],
+    ReduceColSumOnMatrixBKernel<static_cast<int>(GridDim::maxThreadsPerBlock)><<<static_cast<int>(helper.N()), GridDim::maxThreadsPerBlock, 0, stream>>>(matrix + helper.RightOffsets()[batch],
                                                         col_sum + batch * helper.N(),
                                                         offset,
                                                         static_cast<int32_t>(helper.K()),
@@ -92,7 +92,8 @@ __global__ void ComputeOffsetOfMatrixB(const int32_t* row_sum,
   }
 }

-Status OffsetOutput(const int32_t* row_sum,
+Status OffsetOutput(cudaStream_t stream,
+                    const int32_t* row_sum,
                     const int32_t* col_sum,
                     int32_t* output,
                     const int8_t a_offset,
@@ -100,7 +101,7 @@ Status OffsetOutput(const int32_t* row_sum,
                     const MatMulComputeHelper& helper) {
   if (a_offset && b_offset) {
     for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
-      ComputeOffsetOfMatrixAB<<<static_cast<int>(helper.M()), GridDim::maxThreadsPerBlock, 0>>>(
+      ComputeOffsetOfMatrixAB<<<static_cast<int>(helper.M()), GridDim::maxThreadsPerBlock, 0, stream>>>(
           row_sum + batch * helper.M(),
           col_sum + batch * helper.N(),
           output + helper.OutputOffsets()[batch],
@@ -109,14 +110,14 @@ Status OffsetOutput(const int32_t* row_sum,
     }
   } else if (a_offset) {
     for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
-      ComputeOffsetOfMatrixA<<<static_cast<int>(helper.M()), GridDim::maxThreadsPerBlock, 0>>>(
+      ComputeOffsetOfMatrixA<<<static_cast<int>(helper.M()), GridDim::maxThreadsPerBlock, 0, stream>>>(
          col_sum + batch * helper.N(),
          output + helper.OutputOffsets()[batch],
          static_cast<int32_t>(helper.N()));
     }
   } else if (b_offset) {
     for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
-      ComputeOffsetOfMatrixB<<<static_cast<int>(helper.M()), GridDim::maxThreadsPerBlock, 0>>>(
+      ComputeOffsetOfMatrixB<<<static_cast<int>(helper.M()), GridDim::maxThreadsPerBlock, 0, stream>>>(
          row_sum + batch * helper.M(),
          output + helper.OutputOffsets()[batch],
          static_cast<int32_t>(helper.N()));
|
|||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
||||
Status ReduceRowSumOnMatrixA(const int8_t* matrix, int32_t* row_sum, const int8_t offset, const MatMulComputeHelper& helper);
|
||||
Status ReduceColSumOnMatrixB(const int8_t* matrix, int32_t* col_sum, const int8_t offset, const MatMulComputeHelper& helper);
|
||||
Status OffsetOutput(const int32_t* row_sum,
|
||||
Status ReduceRowSumOnMatrixA(cudaStream_t stream, const int8_t* matrix, int32_t* row_sum, const int8_t offset, const MatMulComputeHelper& helper);
|
||||
Status ReduceColSumOnMatrixB(cudaStream_t stream, const int8_t* matrix, int32_t* col_sum, const int8_t offset, const MatMulComputeHelper& helper);
|
||||
Status OffsetOutput(cudaStream_t stream,
|
||||
const int32_t* row_sum,
|
||||
const int32_t* col_sum,
|
||||
int32_t* output,
|
||||
const int8_t a_offset,
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ namespace cuda {
|
|||
|
||||
template <typename T, bool is_log_softmax>
|
||||
Status SoftMaxComputeHelper(
|
||||
cudaStream_t stream,
|
||||
const T* X,
|
||||
const TensorShape& input_shape,
|
||||
T* Y,
|
||||
|
|
@ -28,7 +29,7 @@ Status SoftMaxComputeHelper(
|
|||
// cudnnSoftmaxForward/Backward is not optimal implementation.
|
||||
// TODO: remove cudnn path completely in the future.
|
||||
if (D <= 1024 && D * sizeof(T) <= 4096) {
|
||||
dispatch_softmax_forward<CudaT, CudaT, AccumulationType_t<CudaT>, is_log_softmax>(Y_data, X_data, gsl::narrow_cast<int>(D), gsl::narrow_cast<int>(D), gsl::narrow_cast<int>(N));
|
||||
dispatch_softmax_forward<CudaT, CudaT, AccumulationType_t<CudaT>, is_log_softmax>(stream, Y_data, X_data, gsl::narrow_cast<int>(D), gsl::narrow_cast<int>(D), gsl::narrow_cast<int>(N));
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
|
|
@ -50,8 +51,8 @@ Status SoftMaxComputeHelper(
|
|||
}
|
||||
|
||||
#define SPECIALIZED_SOFTMAX_HELPER_IMPL(T) \
|
||||
template Status SoftMaxComputeHelper<T, false>(const T* input, const TensorShape& shape, T* Y, cudnnHandle_t handle, int64_t axis); \
|
||||
template Status SoftMaxComputeHelper<T, true>(const T* input, const TensorShape& shape, T* Y, cudnnHandle_t handle, int64_t axis);
|
||||
template Status SoftMaxComputeHelper<T, false>(cudaStream_t stream, const T* input, const TensorShape& shape, T* Y, cudnnHandle_t handle, int64_t axis); \
|
||||
template Status SoftMaxComputeHelper<T, true>(cudaStream_t stream, const T* input, const TensorShape& shape, T* Y, cudnnHandle_t handle, int64_t axis);
|
||||
|
||||
SPECIALIZED_SOFTMAX_HELPER_IMPL(float)
|
||||
SPECIALIZED_SOFTMAX_HELPER_IMPL(double)
|
||||
|
|
@ -62,6 +63,7 @@ SPECIALIZED_SOFTMAX_HELPER_IMPL(MLFloat16)
|
|||
#define SPECIALIZED_SOFTMAX_HELPER_IMPL_BFloat16(is_log_softmax) \
|
||||
template <> \
|
||||
Status SoftMaxComputeHelper<BFloat16, is_log_softmax>( \
|
||||
cudaStream_t stream, \
|
||||
const BFloat16* X, \
|
||||
const TensorShape& input_shape, \
|
||||
BFloat16* Y, \
|
||||
|
|
@ -73,7 +75,7 @@ SPECIALIZED_SOFTMAX_HELPER_IMPL(MLFloat16)
|
|||
auto Y_data = reinterpret_cast<CudaT*>(Y); \
|
||||
auto X_data = reinterpret_cast<const CudaT*>(X); \
|
||||
dispatch_softmax_forward<CudaT, CudaT, AccumulationType_t<CudaT>, is_log_softmax>( \
|
||||
Y_data, X_data, gsl::narrow_cast<int>(D), gsl::narrow_cast<int>(D), gsl::narrow_cast<int>(N)); \
|
||||
stream, Y_data, X_data, gsl::narrow_cast<int>(D), gsl::narrow_cast<int>(D), gsl::narrow_cast<int>(N)); \
|
||||
return Status::OK(); \
|
||||
}
|
||||
|
||||
|
|
@@ -183,6 +185,7 @@ Status Softmax<T>::ComputeInternal(OpKernelContext* ctx) const {

     // Perform the transpose
     ORT_RETURN_IF_ERROR(Transpose::DoTranspose(cuda_ep_->GetDeviceProp(),
+                                               Stream(),
                                                CublasHandle(),
                                                permutation, *X, temp_input));
     transposed_input = std::move(temp_input);
@@ -208,11 +211,11 @@ Status Softmax<T>::ComputeInternal(OpKernelContext* ctx) const {

   Status status;
   if (log_softmax_) {
-    status = SoftMaxComputeHelper<T, true>(X_data, *compute_input_shape, Y_data, CudnnHandle(),
+    status = SoftMaxComputeHelper<T, true>(Stream(), X_data, *compute_input_shape, Y_data, CudnnHandle(),
                                            is_transpose_required ? static_cast<int64_t>(rank) - 1
                                                                  : static_cast<int64_t>(axis));
   } else {
-    status = SoftMaxComputeHelper<T, false>(X_data, *compute_input_shape, Y_data, CudnnHandle(),
+    status = SoftMaxComputeHelper<T, false>(Stream(), X_data, *compute_input_shape, Y_data, CudnnHandle(),
                                             is_transpose_required ? static_cast<int64_t>(rank) - 1
                                                                   : static_cast<int64_t>(axis));
   }
@@ -227,6 +230,7 @@ Status Softmax<T>::ComputeInternal(OpKernelContext* ctx) const {
   }
   // Perform the transpose to get the axes back to the original ordering
   ORT_RETURN_IF_ERROR(Transpose::DoTranspose(cuda_ep_->GetDeviceProp(),
+                                             Stream(),
                                              CublasHandle(),
                                              reverse_permutation, intermediate_output, *Y));
 }
@@ -11,6 +11,7 @@ namespace cuda {

 template <typename T, bool is_log_softmax>
 Status SoftMaxComputeHelper(
+    cudaStream_t stream,
     const T* input,
     const TensorShape& shape,
     T* Y,
@@ -18,7 +19,7 @@ Status SoftMaxComputeHelper(
     int64_t axis);

 template <typename input_t, typename output_t, typename acc_t, bool is_log_softmax>
-void dispatch_softmax_forward(output_t* dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count);
+void dispatch_softmax_forward(cudaStream_t stream, output_t* dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count);

 template <typename T>
 class Softmax final : public CudaKernel {
@@ -135,7 +135,7 @@ __global__ void softmax_warp_forward(output_t* dst, const input_t* src, int batc
 }

 template <typename input_t, typename output_t, typename acc_t, bool is_log_softmax>
-void dispatch_softmax_forward(output_t* dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count) {
+void dispatch_softmax_forward(cudaStream_t stream, output_t* dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count) {
   if (softmax_elements == 0) {
     return;
   } else {
@@ -159,47 +159,47 @@ void dispatch_softmax_forward(output_t* dst, const input_t* src, int softmax_ele
     switch (log2_elements) {
       case 0: // 1
         softmax_warp_forward<input_t, output_t, acc_t, 0, is_log_softmax>
-            <<<blocks, threads, 0>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
+            <<<blocks, threads, 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
         break;
       case 1: // 2
         softmax_warp_forward<input_t, output_t, acc_t, 1, is_log_softmax>
-            <<<blocks, threads, 0>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
+            <<<blocks, threads, 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
         break;
       case 2: // 4
         softmax_warp_forward<input_t, output_t, acc_t, 2, is_log_softmax>
-            <<<blocks, threads, 0>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
+            <<<blocks, threads, 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
         break;
       case 3: // 8
         softmax_warp_forward<input_t, output_t, acc_t, 3, is_log_softmax>
-            <<<blocks, threads, 0>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
+            <<<blocks, threads, 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
         break;
       case 4: // 16
         softmax_warp_forward<input_t, output_t, acc_t, 4, is_log_softmax>
-            <<<blocks, threads, 0>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
+            <<<blocks, threads, 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
         break;
       case 5: // 32
         softmax_warp_forward<input_t, output_t, acc_t, 5, is_log_softmax>
-            <<<blocks, threads, 0>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
+            <<<blocks, threads, 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
         break;
       case 6: // 64
         softmax_warp_forward<input_t, output_t, acc_t, 6, is_log_softmax>
-            <<<blocks, threads, 0>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
+            <<<blocks, threads, 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
         break;
       case 7: // 128
         softmax_warp_forward<input_t, output_t, acc_t, 7, is_log_softmax>
-            <<<blocks, threads, 0>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
+            <<<blocks, threads, 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
         break;
       case 8: // 256
         softmax_warp_forward<input_t, output_t, acc_t, 8, is_log_softmax>
-            <<<blocks, threads, 0>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
+            <<<blocks, threads, 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
         break;
       case 9: // 512
         softmax_warp_forward<input_t, output_t, acc_t, 9, is_log_softmax>
-            <<<blocks, threads, 0>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
+            <<<blocks, threads, 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
         break;
       case 10: // 1024
         softmax_warp_forward<input_t, output_t, acc_t, 10, is_log_softmax>
-            <<<blocks, threads, 0>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
+            <<<blocks, threads, 0, stream>>>(dst, src, batch_count, softmax_elements_stride, softmax_elements);
         break;
       default:
         break;
@@ -208,8 +208,8 @@ void dispatch_softmax_forward(output_t* dst, const input_t* src, int softmax_ele
 }

 #define SPECIALIZED_SOFTMAX_IMPL(input_t, output_t, acc_t) \
-  template void dispatch_softmax_forward<input_t, output_t, acc_t, false>(output_t * dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count); \
-  template void dispatch_softmax_forward<input_t, output_t, acc_t, true>(output_t * dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count);
+  template void dispatch_softmax_forward<input_t, output_t, acc_t, false>(cudaStream_t stream, output_t * dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count); \
+  template void dispatch_softmax_forward<input_t, output_t, acc_t, true>(cudaStream_t stream, output_t * dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count);

 SPECIALIZED_SOFTMAX_IMPL(float, float, float)
 SPECIALIZED_SOFTMAX_IMPL(half, half, float)
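The switch specializes the warp softmax on the log2 of the row length (up to 1024), so the per-thread loop bound is a compile-time constant in every instantiation. A hedged sketch of the dispatch computation this implies; this is the standard technique, and the helper name is illustrative:

    // Round the row length up to a power of two and take its log2 so each
    // switch case instantiates the kernel with a constant element count.
    inline int Log2Ceil(int value) {
      int log2 = 0;
      while ((1 << log2) < value) ++log2;
      return log2;
    }

    // Usage: switch (Log2Ceil(softmax_elements)) { case 0: ... case 10: ... }
    // keeps every per-thread loop bound a constant the compiler can unroll.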
@@ -419,23 +419,24 @@ __global__ void ExcludeOutput(int64_t* output_i, int64_t K, int64_t dimension) {
 template <typename T>
 Status TopKImpl(const CudaKernel* kernel, const T* input_x, T* output_v, int64_t* output_i, const TArray<int64_t>& elem_nums, size_t size, int32_t axis, int64_t K, int64_t largest, int64_t sorted, int64_t N, int64_t dimension) {
   typedef typename ToCudaType<T>::MappedType CudaT;
+  cudaStream_t stream = kernel->Stream();
   const CudaT* input_x_ptr = reinterpret_cast<const CudaT*>(input_x);
   CudaT* output_v_ptr = reinterpret_cast<CudaT*>(output_v);

   auto aligned_K = ALIGN(K);
   auto aligned_dimension = ALIGN(dimension);
   if (aligned_dimension <= GridDim::maxThreadsPerBlock) {
-    BitonicTopK<CudaT><<<N, GridDim::maxThreadsPerBlock, aligned_dimension * sizeof(KV<CudaT>)>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, aligned_K, largest, sorted, dimension, aligned_dimension, NumericLimits<T>::Lowest(), NumericLimits<T>::Max());
+    BitonicTopK<CudaT><<<N, GridDim::maxThreadsPerBlock, aligned_dimension * sizeof(KV<CudaT>), stream>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, aligned_K, largest, sorted, dimension, aligned_dimension, NumericLimits<T>::Lowest(), NumericLimits<T>::Max());
   } else if (K <= BT*16 || 0 == sorted) {
     auto XPT = static_cast<int64_t>(ceil(static_cast<double>(dimension) / GridDim::maxThreadsPerBlock));
     if (BT*2 >= K || 0 == sorted) {
-      RadixTopK<CudaT, BT, 2><<<N, BT, 256 * sizeof(uint32_t)>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits<T>::Lowest(), NumericLimits<T>::Max());
+      RadixTopK<CudaT, BT, 2><<<N, BT, 256 * sizeof(uint32_t), stream>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits<T>::Lowest(), NumericLimits<T>::Max());
     } else if (BT*4>=K) {
-      RadixTopK<CudaT, BT, 4><<<N, BT, 256 * sizeof(uint32_t)>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits<T>::Lowest(), NumericLimits<T>::Max());
+      RadixTopK<CudaT, BT, 4><<<N, BT, 256 * sizeof(uint32_t), stream>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits<T>::Lowest(), NumericLimits<T>::Max());
     } else if (BT*8>=K) {
-      RadixTopK<CudaT, BT, 8><<<N, BT, 256 * sizeof(uint32_t)>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits<T>::Lowest(), NumericLimits<T>::Max());
+      RadixTopK<CudaT, BT, 8><<<N, BT, 256 * sizeof(uint32_t), stream>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits<T>::Lowest(), NumericLimits<T>::Max());
     } else {
-      RadixTopK<CudaT, BT, 16><<<N, BT, 256 * sizeof(uint32_t)>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits<T>::Lowest(), NumericLimits<T>::Max());
+      RadixTopK<CudaT, BT, 16><<<N, BT, 256 * sizeof(uint32_t), stream>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits<T>::Lowest(), NumericLimits<T>::Max());
     }
   } else {
     auto input_key_buffer = kernel->GetScratchBuffer<CudaT>(dimension);
@@ -447,21 +448,21 @@ Status TopKImpl(const CudaKernel* kernel, const T* input_x, T* output_v, int64_t
     auto* input_value = input_value_buffer.get();
     auto* output_value = output_value_buffer.get();
     size_t temp_bytes = 0;
-    CUDA_RETURN_IF_ERROR(cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, input_key, output_key, input_value, output_value, dimension));
+    CUDA_RETURN_IF_ERROR(cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, input_key, output_key, input_value, output_value, dimension, 0, sizeof(T)*8, stream));
     auto temp_storage_buffer = kernel->GetScratchBuffer<char>(temp_bytes);
    auto* temp_storage = temp_storage_buffer.get();
     auto blocks_per_grid_D = (int)(ceil(static_cast<float>(dimension) / BT));
     auto blocks_per_grid_K = (int)(ceil(static_cast<float>(K) / BT));
     for (int64_t i = 0; i < N; i++) {
-      FillInput<CudaT><<<blocks_per_grid_D, BT, 0>>>(input_x_ptr, input_key, input_value, elem_nums, size, axis, K, i, dimension);
-      CUDA_RETURN_IF_ERROR(1 == largest ? cub::DeviceRadixSort::SortPairsDescending(temp_storage, temp_bytes, input_key, output_key, input_value, output_value, dimension)
-                                        : cub::DeviceRadixSort::SortPairs(temp_storage, temp_bytes, input_key, output_key, input_value, output_value, dimension));
+      FillInput<CudaT><<<blocks_per_grid_D, BT, 0, stream>>>(input_x_ptr, input_key, input_value, elem_nums, size, axis, K, i, dimension);
+      CUDA_RETURN_IF_ERROR(1 == largest ? cub::DeviceRadixSort::SortPairsDescending(temp_storage, temp_bytes, input_key, output_key, input_value, output_value, dimension, 0, sizeof(T)*8, stream)
+                                        : cub::DeviceRadixSort::SortPairs(temp_storage, temp_bytes, input_key, output_key, input_value, output_value, dimension, 0, sizeof(T)*8, stream));
       if (1 == sorted) {
-        FillOutput<CudaT><<<blocks_per_grid_K, BT, 0>>>(output_key, output_value, output_v_ptr, output_i, elem_nums, size, axis, K, i, dimension);
+        FillOutput<CudaT><<<blocks_per_grid_K, BT, 0, stream>>>(output_key, output_value, output_v_ptr, output_i, elem_nums, size, axis, K, i, dimension);
       } else { //reorder by ascending index
-        ExcludeOutput<<<blocks_per_grid_D, BT, 0>>>(output_value, K, dimension);
-        CUDA_RETURN_IF_ERROR(cub::DeviceRadixSort::SortPairs(temp_storage, temp_bytes, output_value, input_value, output_key, input_key, dimension));
-        FillOutput<CudaT><<<blocks_per_grid_K, BT, 0>>>(input_key, input_value, output_v_ptr, output_i, elem_nums, size, axis, K, i, dimension);
+        ExcludeOutput<<<blocks_per_grid_D, BT, 0, stream>>>(output_value, K, dimension);
+        CUDA_RETURN_IF_ERROR(cub::DeviceRadixSort::SortPairs(temp_storage, temp_bytes, output_value, input_value, output_key, input_key, dimension, 0, sizeof(T)*8, stream));
+        FillOutput<CudaT><<<blocks_per_grid_K, BT, 0, stream>>>(input_key, input_value, output_v_ptr, output_i, elem_nums, size, axis, K, i, dimension);
      }
     }
   }
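CUB's device-wide primitives take the stream as a trailing parameter after begin_bit and end_bit, which is why those defaults (0 and sizeof(T)*8) now have to be spelled out just to reach the stream argument. A hedged, self-contained sketch of the two-phase pattern (size query with a null temp buffer, then the actual sort) on an explicit stream; buffer management here is simplified for illustration:

    #include <cub/cub.cuh>

    cudaError_t SortOnStream(float* keys_in, float* keys_out,
                             int64_t* vals_in, int64_t* vals_out,
                             int n, cudaStream_t stream) {
      size_t temp_bytes = 0;
      // First call only computes the required temp storage size.
      cudaError_t err = cub::DeviceRadixSort::SortPairs(
          nullptr, temp_bytes, keys_in, keys_out, vals_in, vals_out,
          n, 0, static_cast<int>(sizeof(float) * 8), stream);
      if (err != cudaSuccess) return err;

      void* temp = nullptr;
      err = cudaMalloc(&temp, temp_bytes);  // real code would use a scratch allocator
      if (err != cudaSuccess) return err;

      // Second call performs the sort, enqueued on `stream`.
      err = cub::DeviceRadixSort::SortPairs(
          temp, temp_bytes, keys_in, keys_out, vals_in, vals_out,
          n, 0, static_cast<int>(sizeof(float) * 8), stream);
      cudaFree(temp);
      return err;
    }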
@@ -50,6 +50,7 @@ Status UnaryElementwise::Prepare(OpKernelContext* context, UnaryElementwisePrepa
   UnaryElementwisePreparation p; \
   ORT_RETURN_IF_ERROR(UnaryElementwise::Prepare(context, &p)); \
   Impl_##x( \
+      Stream(), \
       reinterpret_cast<const typename ToCudaType<T>::MappedType*>(p.input_tensor->template Data<T>()), \
       reinterpret_cast<typename ToCudaType<T>::MappedType*>(p.output_tensor->template MutableData<T>()), \
       p.output_tensor->Shape().Size()); \
Some files were not shown because too many files have changed in this diff.