mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-20 02:07:56 +00:00
[ROCm] Enable Sampling Op UT on AMD (#14581)
Making basic porting effort to run Sampling UT on ROCm ep, based on the commits: https://github.com/microsoft/onnxruntime/pull/13426 https://github.com/microsoft/onnxruntime/pull/14218 1. enabling EmbedLayerNorm op 2. enabling Sampling op 3. enabling helpers to copy data from CPU->GPU for subgraph This task is the first checkpoint. There could be other missing ops when testing a real model. We will migrate more code onto ROCm as needed. Co-authored-by: Ubuntu <ettao@ettao-amd-dev1.zvflicr54joexhdgnhvmxrxygg.phxx.internal.cloudapp.net>
This commit is contained in:
parent
a5dab850b8
commit
d632f9a3fa
12 changed files with 142 additions and 118 deletions
|
|
@ -11,10 +11,6 @@ set(contrib_ops_excluded_files
|
|||
"bert/attention_softmax.h"
|
||||
"bert/multihead_attention.cc"
|
||||
"bert/multihead_attention.h"
|
||||
"bert/embed_layer_norm.cc"
|
||||
"bert/embed_layer_norm.h"
|
||||
"bert/embed_layer_norm_impl.cu"
|
||||
"bert/embed_layer_norm_impl.h"
|
||||
"bert/fast_gelu_impl.cu"
|
||||
"bert/fast_gelu_impl.h"
|
||||
"bert/fast_gelu.cc"
|
||||
|
|
@ -85,17 +81,8 @@ set(contrib_ops_excluded_files
|
|||
"tensor/image_scaler_impl.h"
|
||||
"transformers/beam_search.cc"
|
||||
"transformers/beam_search.h"
|
||||
"transformers/generation_device_helper.cc"
|
||||
"transformers/generation_device_helper.h"
|
||||
"transformers/generation_cuda_impl.cu"
|
||||
"transformers/generation_cuda_impl.h"
|
||||
"transformers/greedy_search.cc"
|
||||
"transformers/greedy_search.h"
|
||||
"transformers/sampling.cc"
|
||||
"transformers/sampling.h"
|
||||
"transformers/sampling_cuda_helper.h"
|
||||
"transformers/dump_cuda_tensor.cc"
|
||||
"transformers/dump_cuda_tensor.h"
|
||||
"conv_transpose_with_dynamic_pads.cc"
|
||||
"conv_transpose_with_dynamic_pads.h"
|
||||
"cuda_contrib_kernels.cc"
|
||||
|
|
|
|||
|
|
@ -116,7 +116,9 @@ const IExecutionProvider* Subgraph::GetProvider() const {
|
|||
const ExecutionProviders& providers = session_state_->GetExecutionProviders();
|
||||
const IExecutionProvider* cpu_provider = providers.Get(onnxruntime::kCpuExecutionProvider);
|
||||
const IExecutionProvider* cuda_provider = providers.Get(onnxruntime::kCudaExecutionProvider);
|
||||
const IExecutionProvider* provider = cuda_provider ? cuda_provider : cpu_provider;
|
||||
const IExecutionProvider* rocm_provider = providers.Get(onnxruntime::kRocmExecutionProvider);
|
||||
const IExecutionProvider* gpu_provider = cuda_provider ? cuda_provider : rocm_provider;
|
||||
const IExecutionProvider* provider = gpu_provider ? gpu_provider : cpu_provider;
|
||||
return provider;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -17,12 +17,12 @@ class PinnedHostBuffer {
|
|||
public:
|
||||
PinnedHostBuffer(size_t length)
|
||||
: buffer_(nullptr) {
|
||||
cudaHostAlloc(&buffer_, length * sizeof(T), cudaHostAllocDefault);
|
||||
CUDA_CALL_THROW(cudaHostAlloc((void**)&buffer_, length * sizeof(T), cudaHostAllocDefault));
|
||||
}
|
||||
|
||||
virtual ~PinnedHostBuffer() {
|
||||
if (buffer_) {
|
||||
cudaFreeHost(buffer_);
|
||||
CUDA_CALL_THROW(cudaFreeHost(buffer_));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -46,8 +46,9 @@ void DumpGpuTensor(const char* name, const T* tensor, int dim0, int dim1, bool i
|
|||
// In that case, we copy tensor data as well. It is not needed, but it keeps code simple.
|
||||
int num_items = dim0 * dim1;
|
||||
auto data = std::make_shared<PinnedHostBuffer<T>>(num_items);
|
||||
cudaDeviceSynchronize();
|
||||
cudaMemcpy(*data, tensor, num_items * sizeof(T), is_gpu_tensor ? cudaMemcpyDeviceToHost : cudaMemcpyHostToHost);
|
||||
CUDA_CALL_THROW(cudaDeviceSynchronize());
|
||||
CUDA_CALL_THROW(cudaMemcpy(*data, tensor, num_items * sizeof(T), is_gpu_tensor ? cudaMemcpyDeviceToHost : cudaMemcpyHostToHost));
|
||||
|
||||
|
||||
if (nullptr != name) {
|
||||
std::cout << std::string(name) << std::endl;
|
||||
|
|
@ -64,8 +65,8 @@ template <typename T>
|
|||
void DumpGpuTensor(const char* name, const T* tensor, int dim0, int dim1, int dim2, bool is_gpu_tensor) {
|
||||
int num_items = dim0 * dim1 * dim2;
|
||||
auto data = std::make_shared<PinnedHostBuffer<T>>(num_items);
|
||||
cudaDeviceSynchronize();
|
||||
cudaMemcpy(*data, tensor, num_items * sizeof(T), is_gpu_tensor ? cudaMemcpyDeviceToHost : cudaMemcpyHostToHost);
|
||||
CUDA_CALL_THROW(cudaDeviceSynchronize());
|
||||
CUDA_CALL_THROW(cudaMemcpy(*data, tensor, num_items * sizeof(T), is_gpu_tensor ? cudaMemcpyDeviceToHost : cudaMemcpyHostToHost));
|
||||
|
||||
if (nullptr != name) {
|
||||
std::cout << std::string(name) << std::endl;
|
||||
|
|
@ -82,8 +83,8 @@ template <typename T>
|
|||
void DumpGpuTensor(const char* name, const T* tensor, int dim0, int dim1, int dim2, int dim3, bool is_gpu_tensor) {
|
||||
int num_items = dim0 * dim1 * dim2 * dim3;
|
||||
auto data = std::make_shared<PinnedHostBuffer<T>>(num_items);
|
||||
cudaDeviceSynchronize();
|
||||
cudaMemcpy(*data, tensor, num_items * sizeof(T), is_gpu_tensor ? cudaMemcpyDeviceToHost : cudaMemcpyHostToHost);
|
||||
CUDA_CALL_THROW(cudaDeviceSynchronize());
|
||||
CUDA_CALL_THROW(cudaMemcpy(*data, tensor, num_items * sizeof(T), is_gpu_tensor ? cudaMemcpyDeviceToHost : cudaMemcpyHostToHost));
|
||||
|
||||
if (nullptr != name) {
|
||||
std::cout << std::string(name) << std::endl;
|
||||
|
|
|
|||
|
|
@ -320,33 +320,33 @@ void GetTempStorageSize(const T* d_keys_in,
|
|||
bool is_descending,
|
||||
size_t& temp_storage_bytes) {
|
||||
if (is_descending) {
|
||||
cub::DeviceSegmentedRadixSort::SortPairsDescending(nullptr,
|
||||
temp_storage_bytes,
|
||||
d_keys_in,
|
||||
(T*)nullptr,
|
||||
d_values_in,
|
||||
(int*)nullptr,
|
||||
num_items,
|
||||
num_segments,
|
||||
d_offsets,
|
||||
d_offsets + 1,
|
||||
0,
|
||||
sizeof(T) * 8,
|
||||
stream);
|
||||
CUDA_CALL_THROW(cub::DeviceSegmentedRadixSort::SortPairsDescending(nullptr,
|
||||
temp_storage_bytes,
|
||||
d_keys_in,
|
||||
(T*)nullptr,
|
||||
d_values_in,
|
||||
(int*)nullptr,
|
||||
num_items,
|
||||
num_segments,
|
||||
d_offsets,
|
||||
d_offsets + 1,
|
||||
0,
|
||||
sizeof(T) * 8,
|
||||
stream));
|
||||
} else {
|
||||
cub::DeviceSegmentedRadixSort::SortPairs(nullptr,
|
||||
temp_storage_bytes,
|
||||
d_keys_in,
|
||||
(T*)nullptr,
|
||||
d_values_in,
|
||||
(int*)nullptr,
|
||||
num_items,
|
||||
num_segments,
|
||||
d_offsets,
|
||||
d_offsets + 1,
|
||||
0,
|
||||
sizeof(T) * 8,
|
||||
stream);
|
||||
CUDA_CALL_THROW(cub::DeviceSegmentedRadixSort::SortPairs(nullptr,
|
||||
temp_storage_bytes,
|
||||
d_keys_in,
|
||||
(T*)nullptr,
|
||||
d_values_in,
|
||||
(int*)nullptr,
|
||||
num_items,
|
||||
num_segments,
|
||||
d_offsets,
|
||||
d_offsets + 1,
|
||||
0,
|
||||
sizeof(T) * 8,
|
||||
stream));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -412,33 +412,33 @@ void LaunchSortPairs(void* d_temp_storage,
|
|||
cudaStream_t stream,
|
||||
bool is_descending) {
|
||||
if (is_descending) {
|
||||
cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage,
|
||||
temp_storage_bytes,
|
||||
d_keys_in,
|
||||
d_keys_out,
|
||||
d_values_in,
|
||||
d_values_out,
|
||||
num_items,
|
||||
num_segments,
|
||||
d_offsets,
|
||||
d_offsets + 1,
|
||||
0,
|
||||
sizeof(T) * 8,
|
||||
stream);
|
||||
CUDA_CALL_THROW(cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage,
|
||||
temp_storage_bytes,
|
||||
d_keys_in,
|
||||
d_keys_out,
|
||||
d_values_in,
|
||||
d_values_out,
|
||||
num_items,
|
||||
num_segments,
|
||||
d_offsets,
|
||||
d_offsets + 1,
|
||||
0,
|
||||
sizeof(T) * 8,
|
||||
stream));
|
||||
} else {
|
||||
cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage,
|
||||
temp_storage_bytes,
|
||||
d_keys_in,
|
||||
d_keys_out,
|
||||
d_values_in,
|
||||
d_values_out,
|
||||
num_items,
|
||||
num_segments,
|
||||
d_offsets,
|
||||
d_offsets + 1,
|
||||
0,
|
||||
sizeof(T) * 8,
|
||||
stream);
|
||||
CUDA_CALL_THROW(cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage,
|
||||
temp_storage_bytes,
|
||||
d_keys_in,
|
||||
d_keys_out,
|
||||
d_values_in,
|
||||
d_values_out,
|
||||
num_items,
|
||||
num_segments,
|
||||
d_offsets,
|
||||
d_offsets + 1,
|
||||
0,
|
||||
sizeof(T) * 8,
|
||||
stream));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -721,9 +721,9 @@ void TorchMultinomialKernelLauncher(float* d_input,
|
|||
cudaStream_t stream) {
|
||||
// Store the props in class variables
|
||||
int device;
|
||||
cudaGetDevice(&device);
|
||||
CUDA_CALL_THROW(cudaGetDevice(&device));
|
||||
cudaDeviceProp props;
|
||||
cudaGetDeviceProperties(&props, device);
|
||||
CUDA_CALL_THROW(cudaGetDeviceProperties(&props, device));
|
||||
|
||||
int numSM = props.multiProcessorCount;
|
||||
int maxThreads = props.maxThreadsPerBlock;
|
||||
|
|
|
|||
|
|
@ -17,14 +17,23 @@
|
|||
#include "contrib_ops/cpu/transformers/subgraph_gpt.h"
|
||||
#include "contrib_ops/cuda/transformers/beam_search_topk.h"
|
||||
#include "contrib_ops/cuda/transformers/greedy_search_top_one.h"
|
||||
|
||||
// the includes would be dummy for ROCm, we will ignore them for now
|
||||
#ifdef ENABLE_NVTX_PROFILE
|
||||
#include "core/providers/cuda/nvtx_profile.h"
|
||||
#include "core/providers/cuda/nvtx_profile_context.h"
|
||||
#endif
|
||||
|
||||
#include "sampling_cuda_helper.h"
|
||||
|
||||
#ifdef DEBUG_GENERATION
|
||||
#include <iostream>
|
||||
#endif
|
||||
|
||||
using onnxruntime::cuda::ToCudaType;
|
||||
using onnxruntime::cuda::TArray;
|
||||
using onnxruntime::cuda::TopKImpl;
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace concurrency {
|
||||
class ThreadPool;
|
||||
|
|
@ -203,12 +212,13 @@ void InitBeamState(transformers::IBeamSearchState<T>* beam_state,
|
|||
|
||||
// TODO(tianleiwu): we can use another stream to avoid blocking subgraph execution.
|
||||
cudaStream_t cuda_stream = ort_stream ? static_cast<cudaStream_t>(ort_stream->GetHandle()) : nullptr;
|
||||
cudaMemsetAsync(beam_state->next_token_logits.data(), 0, beam_state->next_token_logits.size_bytes(), cuda_stream);
|
||||
cudaMemsetAsync(beam_state->next_token_scores.data(), 0, beam_state->next_token_scores.size_bytes(), cuda_stream);
|
||||
cudaMemsetAsync(beam_state->next_tokens.data(), 0, beam_state->next_tokens.size_bytes(), cuda_stream);
|
||||
cudaMemsetAsync(beam_state->next_indices.data(), 0, beam_state->next_indices.size_bytes(), cuda_stream);
|
||||
cudaMemsetAsync(beam_state->next_scores.data(), 0, beam_state->next_scores.size_bytes(), cuda_stream);
|
||||
cudaMemsetAsync(beam_state->topk_buffer.data(), 0, beam_state->topk_buffer.size_bytes(), cuda_stream);
|
||||
CUDA_CALL_THROW(cudaMemsetAsync(beam_state->next_token_logits.data(), 0, beam_state->next_token_logits.size_bytes(), cuda_stream));
|
||||
CUDA_CALL_THROW(cudaMemsetAsync(beam_state->next_token_scores.data(), 0, beam_state->next_token_scores.size_bytes(), cuda_stream));
|
||||
CUDA_CALL_THROW(cudaMemsetAsync(beam_state->next_tokens.data(), 0, beam_state->next_tokens.size_bytes(), cuda_stream));
|
||||
CUDA_CALL_THROW(cudaMemsetAsync(beam_state->next_indices.data(), 0, beam_state->next_indices.size_bytes(), cuda_stream));
|
||||
CUDA_CALL_THROW(cudaMemsetAsync(beam_state->next_scores.data(), 0, beam_state->next_scores.size_bytes(), cuda_stream));
|
||||
CUDA_CALL_THROW(cudaMemsetAsync(beam_state->topk_buffer.data(), 0, beam_state->topk_buffer.size_bytes(), cuda_stream));
|
||||
|
||||
|
||||
// Initialize score of first beam of each group with 0 and the rest with -1e9.
|
||||
cuda::LaunchInitKernel(beam_state->beam_scores.data(), batch_size, num_beams, cuda_stream);
|
||||
|
|
@ -216,8 +226,8 @@ void InitBeamState(transformers::IBeamSearchState<T>* beam_state,
|
|||
// copy sequence lengths to GPU
|
||||
// since next_positions is only needed to update feeds after subgraph execution, so it is fine to use Async here.
|
||||
if (!beam_state->next_positions.empty()) { // next_positions is empty for T5
|
||||
cudaMemcpyAsync(beam_state->next_positions.data(), sequence_lengths.data(), sequence_lengths.size_bytes(),
|
||||
cudaMemcpyHostToDevice, cuda_stream);
|
||||
CUDA_CALL_THROW(cudaMemcpyAsync(beam_state->next_positions.data(), sequence_lengths.data(), sequence_lengths.size_bytes(),
|
||||
cudaMemcpyHostToDevice, cuda_stream));
|
||||
}
|
||||
|
||||
#ifdef ENABLE_NVTX_PROFILE
|
||||
|
|
@ -234,12 +244,12 @@ void InitGreedyState(transformers::IGreedySearchState<T>* greedy_state,
|
|||
initStateRange.Begin();
|
||||
#endif
|
||||
|
||||
cudaStream_t cuda_stream = ort_stream ? reinterpret_cast<cudaStream_t>(ort_stream->GetHandle()) : nullptr;
|
||||
cudaMemsetAsync(greedy_state->next_token_scores.data(), 0, greedy_state->next_token_scores.size_bytes(), cuda_stream);
|
||||
cudaMemsetAsync(greedy_state->next_positions.data(), 0, greedy_state->next_positions.size_bytes(), cuda_stream);
|
||||
cudaStream_t cuda_stream = ort_stream ? reinterpret_cast<cudaStream_t>(ort_stream->GetHandle()) : nullptr;
|
||||
CUDA_CALL_THROW(cudaMemsetAsync(greedy_state->next_token_scores.data(), 0, greedy_state->next_token_scores.size_bytes(), cuda_stream));
|
||||
CUDA_CALL_THROW(cudaMemsetAsync(greedy_state->next_positions.data(), 0, greedy_state->next_positions.size_bytes(), cuda_stream));
|
||||
|
||||
cudaMemcpyAsync(greedy_state->next_positions.data(), sequence_lengths.data(), sequence_lengths.size_bytes(),
|
||||
cudaMemcpyHostToDevice, cuda_stream);
|
||||
CUDA_CALL_THROW(cudaMemcpyAsync(greedy_state->next_positions.data(), sequence_lengths.data(), sequence_lengths.size_bytes(),
|
||||
cudaMemcpyHostToDevice, cuda_stream));
|
||||
|
||||
#ifdef ENABLE_NVTX_PROFILE
|
||||
initStateRange.End();
|
||||
|
|
|
|||
|
|
@ -11,6 +11,9 @@
|
|||
#include <iostream>
|
||||
#endif
|
||||
|
||||
using onnxruntime::cuda::ToCudaType;
|
||||
using onnxruntime::cuda::dispatch_blockwise_softmax_forward;
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace contrib {
|
||||
namespace SamplingCudaHelper {
|
||||
|
|
|
|||
|
|
@ -69,6 +69,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1
|
|||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, ParametricSoftplus);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, double, ParametricSoftplus);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, ParametricSoftplus);
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, Sampling);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, ScaledTanh);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, double, ScaledTanh);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, ScaledTanh);
|
||||
|
|
@ -166,8 +167,8 @@ Status RegisterRocmContribKernels(KernelRegistry& kernel_registry) {
|
|||
1, MLFloat16, DecoderAttention)>,
|
||||
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, int32_t, DynamicSlice)>,
|
||||
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, int64_t, DynamicSlice)>,
|
||||
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, EmbedLayerNormalization)>,
|
||||
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, EmbedLayerNormalization)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, EmbedLayerNormalization)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, EmbedLayerNormalization)>,
|
||||
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, ImageScaler)>,
|
||||
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, double, ImageScaler)>,
|
||||
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, ImageScaler)>,
|
||||
|
|
@ -178,6 +179,7 @@ Status RegisterRocmContribKernels(KernelRegistry& kernel_registry) {
|
|||
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, ParametricSoftplus)>,
|
||||
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, double, ParametricSoftplus)>,
|
||||
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, ParametricSoftplus)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, Sampling)>,
|
||||
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, ScaledTanh)>,
|
||||
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, double, ScaledTanh)>,
|
||||
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, ScaledTanh)>,
|
||||
|
|
|
|||
|
|
@ -1004,7 +1004,7 @@ Status SessionState::CreateSubgraphSessionState() {
|
|||
for (auto& node : graph_.Nodes()) {
|
||||
for (auto& entry : node.GetAttributeNameToMutableSubgraphMap()) {
|
||||
const auto& ep = node.GetExecutionProviderType();
|
||||
if (!ep.empty() && ep != kCpuExecutionProvider && ep != kCudaExecutionProvider) {
|
||||
if (!ep.empty() && ep != kCpuExecutionProvider && ep != kCudaExecutionProvider && ep != kRocmExecutionProvider) {
|
||||
// SessionState is only used when ORT is executing the subgraph. If a non-ORT EP has taken the control flow
|
||||
// node containing the subgraph it will create whatever state it needs internally.
|
||||
continue;
|
||||
|
|
|
|||
|
|
@ -131,6 +131,9 @@ SPECIALIZED_BLOCKWISE_SOFTMAX_IMPL(half, half, float)
|
|||
SPECIALIZED_BLOCKWISE_SOFTMAX_IMPL(double, double, double)
|
||||
SPECIALIZED_BLOCKWISE_SOFTMAX_IMPL(BFloat16, BFloat16, float)
|
||||
|
||||
#ifndef DISABLE_CONTRIB_OPS
|
||||
SPECIALIZED_BLOCKWISE_SOFTMAX_IMPL(half, float, float) // used by BeamSearch op
|
||||
#endif
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -17,10 +17,11 @@ static void RunTest(const embedlayernorm::OpData& data,
|
|||
int min_cuda_architecture = use_float16 ? 530 : 0;
|
||||
|
||||
bool enable_cuda = HasCudaEnvironment(min_cuda_architecture);
|
||||
bool enable_rocm = DefaultRocmExecutionProvider().get() != nullptr;
|
||||
bool enable_dml = DefaultDmlExecutionProvider().get() != nullptr;
|
||||
bool enable_cpu = !use_float16;
|
||||
|
||||
if (enable_cpu || enable_cuda || enable_dml) {
|
||||
if (enable_cpu || enable_cuda || enable_dml || enable_rocm) {
|
||||
// Input and output shapes
|
||||
// Input 0 - input_ids : (batch_size, sequence_size)
|
||||
// Input 1 - segment_ids : (batch_size, sequence_size)
|
||||
|
|
@ -143,6 +144,10 @@ static void RunTest(const embedlayernorm::OpData& data,
|
|||
std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
|
||||
execution_providers.push_back(DefaultCudaExecutionProvider());
|
||||
tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
|
||||
} else if (enable_rocm) {
|
||||
std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
|
||||
execution_providers.push_back(DefaultRocmExecutionProvider());
|
||||
tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
|
||||
} else if (enable_dml) {
|
||||
std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
|
||||
execution_providers.push_back(DefaultDmlExecutionProvider());
|
||||
|
|
|
|||
|
|
@ -14,8 +14,8 @@ namespace onnxruntime {
|
|||
namespace test {
|
||||
|
||||
#if defined(__linux__) && !defined(__ANDROID__)
|
||||
#ifdef USE_CUDA
|
||||
TEST(SamplingTest, Gpt2Sampling_CUDA) {
|
||||
#if defined(USE_CUDA) || defined(USE_ROCM)
|
||||
TEST(SamplingTest, Gpt2Sampling_GPU) {
|
||||
std::vector<int32_t> input_ids{
|
||||
0, 0, 0, 0, 0, 52, 195, 731, 321, 301, 734, 620,
|
||||
41, 554, 74, 622, 206, 222, 75, 223, 221, 198, 224, 572,
|
||||
|
|
@ -25,7 +25,6 @@ TEST(SamplingTest, Gpt2Sampling_CUDA) {
|
|||
std::vector<int32_t> min_length{1};
|
||||
std::vector<float> repetition_penalty{1.0f};
|
||||
|
||||
|
||||
std::vector<int32_t> expected_output{
|
||||
0, 0, 0, 0, 0, 52, 195, 731, 321, 301, 734, 620, 125, 543, 668,
|
||||
41, 554, 74, 622, 206, 222, 75, 223, 221, 198, 224, 572, 776, 213, 697,
|
||||
|
|
@ -35,9 +34,7 @@ TEST(SamplingTest, Gpt2Sampling_CUDA) {
|
|||
const int64_t sequence_length = 12;
|
||||
|
||||
std::vector<int64_t> input_ids_shape{batch_size, sequence_length};
|
||||
|
||||
std::vector<int64_t> parameter_shape{1};
|
||||
|
||||
std::vector<int64_t> expected_output_shape{input_ids_shape[0], max_length[0]};
|
||||
|
||||
Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault);
|
||||
|
|
@ -62,28 +59,36 @@ TEST(SamplingTest, Gpt2Sampling_CUDA) {
|
|||
const char* const output_names[] = {"sequences"};
|
||||
|
||||
Ort::SessionOptions session_options;
|
||||
#ifdef USE_CUDA
|
||||
constexpr int min_cuda_architecture = 530;
|
||||
if (HasCudaEnvironment(min_cuda_architecture)) {
|
||||
Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0));
|
||||
|
||||
Ort::Session session(*ort_env, ORT_TSTR("testdata/transformers/tiny_gpt2_sampling.onnx"), session_options);
|
||||
|
||||
auto ort_outputs = session.Run(Ort::RunOptions{}, input_names, ort_inputs.data(), ort_inputs.size(),
|
||||
output_names, 1);
|
||||
|
||||
ASSERT_EQ(ort_outputs.size(), 1U);
|
||||
const auto& sequences = ort_outputs[0];
|
||||
ASSERT_TRUE(sequences.IsTensor());
|
||||
|
||||
auto result_ts = sequences.GetTensorTypeAndShapeInfo();
|
||||
ASSERT_EQ(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, result_ts.GetElementType());
|
||||
|
||||
ASSERT_EQ(expected_output_shape, result_ts.GetShape());
|
||||
const auto* result_vals = sequences.GetTensorData<int32_t>();
|
||||
auto result_span = gsl::make_span(result_vals, expected_output.size());
|
||||
|
||||
ASSERT_TRUE(std::equal(expected_output.cbegin(), expected_output.cend(), result_span.begin(), result_span.end()));
|
||||
if (!HasCudaEnvironment(min_cuda_architecture)) {
|
||||
LOGS_DEFAULT(WARNING) << "Hardware NOT support current architecture";
|
||||
return;
|
||||
}
|
||||
Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0));
|
||||
#else // USE_ROCM
|
||||
OrtROCMProviderOptions rocm_options;
|
||||
// TODO - verify the default settings
|
||||
session_options.AppendExecutionProvider_ROCM(rocm_options);
|
||||
#endif
|
||||
|
||||
Ort::Session session(*ort_env, ORT_TSTR("testdata/transformers/tiny_gpt2_sampling.onnx"), session_options);
|
||||
|
||||
auto ort_outputs = session.Run(Ort::RunOptions{}, input_names, ort_inputs.data(), ort_inputs.size(),
|
||||
output_names, 1);
|
||||
|
||||
ASSERT_EQ(ort_outputs.size(), 1U);
|
||||
const auto& sequences = ort_outputs[0];
|
||||
ASSERT_TRUE(sequences.IsTensor());
|
||||
|
||||
auto result_ts = sequences.GetTensorTypeAndShapeInfo();
|
||||
ASSERT_EQ(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, result_ts.GetElementType());
|
||||
|
||||
ASSERT_EQ(expected_output_shape, result_ts.GetShape());
|
||||
const auto* result_vals = sequences.GetTensorData<int32_t>();
|
||||
auto result_span = gsl::make_span(result_vals, expected_output.size());
|
||||
|
||||
ASSERT_TRUE(std::equal(expected_output.cbegin(), expected_output.cend(), result_span.begin(), result_span.end()));
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
|
|||
|
|
@ -59,6 +59,8 @@ def hipify(hipify_perl_path, src_file_path, dst_file_path):
|
|||
s = s.replace("GPU_WARP_SIZE = 32", "GPU_WARP_SIZE = 64")
|
||||
s = s.replace("std::exp", "expf")
|
||||
s = s.replace("std::log", "logf")
|
||||
s = s.replace("WaitCudaNotificationOnDevice", "WaitRocmNotificationOnDevice")
|
||||
s = s.replace("hipHostAlloc", "hipHostMalloc")
|
||||
s = s.replace(
|
||||
"#include <cub/device/device_radix_sort.cuh>",
|
||||
"#include <hipcub/hipcub.hpp>\n#include <hipcub/backend/rocprim/device/device_radix_sort.hpp>",
|
||||
|
|
@ -67,6 +69,10 @@ def hipify(hipify_perl_path, src_file_path, dst_file_path):
|
|||
'#include "cub/device/device_radix_sort.cuh"',
|
||||
"#include <hipcub/hipcub.hpp>\n#include <hipcub/backend/rocprim/device/device_radix_sort.hpp>",
|
||||
)
|
||||
s = s.replace(
|
||||
"#include <cub/device/device_segmented_radix_sort.cuh>",
|
||||
"#include <hipcub/backend/rocprim/device/device_segmented_radix_sort.hpp>",
|
||||
)
|
||||
s = s.replace(
|
||||
"#include <cub/device/device_reduce.cuh>", "#include <hipcub/backend/rocprim/device/device_reduce.hpp>"
|
||||
)
|
||||
|
|
|
|||
Loading…
Reference in a new issue