[ROCm] Enable Sampling Op UT on AMD (#14581)

Making basic porting effort to run Sampling UT on ROCm ep, based on the
commits:

https://github.com/microsoft/onnxruntime/pull/13426
https://github.com/microsoft/onnxruntime/pull/14218

1. enabling EmbedLayerNorm op
2. enabling Sampling op
3. enabling helpers to copy data from CPU->GPU for subgraph

This task is the first checkpoint. There could be other missing ops when
testing a real model.
We will migrate more code onto ROCm as needed.

Co-authored-by: Ubuntu <ettao@ettao-amd-dev1.zvflicr54joexhdgnhvmxrxygg.phxx.internal.cloudapp.net>
This commit is contained in:
ytaous 2023-02-06 20:52:06 -08:00 committed by GitHub
parent a5dab850b8
commit d632f9a3fa
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 142 additions and 118 deletions

View file

@ -11,10 +11,6 @@ set(contrib_ops_excluded_files
"bert/attention_softmax.h"
"bert/multihead_attention.cc"
"bert/multihead_attention.h"
"bert/embed_layer_norm.cc"
"bert/embed_layer_norm.h"
"bert/embed_layer_norm_impl.cu"
"bert/embed_layer_norm_impl.h"
"bert/fast_gelu_impl.cu"
"bert/fast_gelu_impl.h"
"bert/fast_gelu.cc"
@ -85,17 +81,8 @@ set(contrib_ops_excluded_files
"tensor/image_scaler_impl.h"
"transformers/beam_search.cc"
"transformers/beam_search.h"
"transformers/generation_device_helper.cc"
"transformers/generation_device_helper.h"
"transformers/generation_cuda_impl.cu"
"transformers/generation_cuda_impl.h"
"transformers/greedy_search.cc"
"transformers/greedy_search.h"
"transformers/sampling.cc"
"transformers/sampling.h"
"transformers/sampling_cuda_helper.h"
"transformers/dump_cuda_tensor.cc"
"transformers/dump_cuda_tensor.h"
"conv_transpose_with_dynamic_pads.cc"
"conv_transpose_with_dynamic_pads.h"
"cuda_contrib_kernels.cc"

View file

@ -116,7 +116,9 @@ const IExecutionProvider* Subgraph::GetProvider() const {
const ExecutionProviders& providers = session_state_->GetExecutionProviders();
const IExecutionProvider* cpu_provider = providers.Get(onnxruntime::kCpuExecutionProvider);
const IExecutionProvider* cuda_provider = providers.Get(onnxruntime::kCudaExecutionProvider);
const IExecutionProvider* provider = cuda_provider ? cuda_provider : cpu_provider;
const IExecutionProvider* rocm_provider = providers.Get(onnxruntime::kRocmExecutionProvider);
const IExecutionProvider* gpu_provider = cuda_provider ? cuda_provider : rocm_provider;
const IExecutionProvider* provider = gpu_provider ? gpu_provider : cpu_provider;
return provider;
}

View file

@ -17,12 +17,12 @@ class PinnedHostBuffer {
public:
PinnedHostBuffer(size_t length)
: buffer_(nullptr) {
cudaHostAlloc(&buffer_, length * sizeof(T), cudaHostAllocDefault);
CUDA_CALL_THROW(cudaHostAlloc((void**)&buffer_, length * sizeof(T), cudaHostAllocDefault));
}
virtual ~PinnedHostBuffer() {
if (buffer_) {
cudaFreeHost(buffer_);
CUDA_CALL_THROW(cudaFreeHost(buffer_));
}
}
@ -46,8 +46,9 @@ void DumpGpuTensor(const char* name, const T* tensor, int dim0, int dim1, bool i
// In that case, we copy tensor data as well. It is not needed, but it keeps code simple.
int num_items = dim0 * dim1;
auto data = std::make_shared<PinnedHostBuffer<T>>(num_items);
cudaDeviceSynchronize();
cudaMemcpy(*data, tensor, num_items * sizeof(T), is_gpu_tensor ? cudaMemcpyDeviceToHost : cudaMemcpyHostToHost);
CUDA_CALL_THROW(cudaDeviceSynchronize());
CUDA_CALL_THROW(cudaMemcpy(*data, tensor, num_items * sizeof(T), is_gpu_tensor ? cudaMemcpyDeviceToHost : cudaMemcpyHostToHost));
if (nullptr != name) {
std::cout << std::string(name) << std::endl;
@ -64,8 +65,8 @@ template <typename T>
void DumpGpuTensor(const char* name, const T* tensor, int dim0, int dim1, int dim2, bool is_gpu_tensor) {
int num_items = dim0 * dim1 * dim2;
auto data = std::make_shared<PinnedHostBuffer<T>>(num_items);
cudaDeviceSynchronize();
cudaMemcpy(*data, tensor, num_items * sizeof(T), is_gpu_tensor ? cudaMemcpyDeviceToHost : cudaMemcpyHostToHost);
CUDA_CALL_THROW(cudaDeviceSynchronize());
CUDA_CALL_THROW(cudaMemcpy(*data, tensor, num_items * sizeof(T), is_gpu_tensor ? cudaMemcpyDeviceToHost : cudaMemcpyHostToHost));
if (nullptr != name) {
std::cout << std::string(name) << std::endl;
@ -82,8 +83,8 @@ template <typename T>
void DumpGpuTensor(const char* name, const T* tensor, int dim0, int dim1, int dim2, int dim3, bool is_gpu_tensor) {
int num_items = dim0 * dim1 * dim2 * dim3;
auto data = std::make_shared<PinnedHostBuffer<T>>(num_items);
cudaDeviceSynchronize();
cudaMemcpy(*data, tensor, num_items * sizeof(T), is_gpu_tensor ? cudaMemcpyDeviceToHost : cudaMemcpyHostToHost);
CUDA_CALL_THROW(cudaDeviceSynchronize());
CUDA_CALL_THROW(cudaMemcpy(*data, tensor, num_items * sizeof(T), is_gpu_tensor ? cudaMemcpyDeviceToHost : cudaMemcpyHostToHost));
if (nullptr != name) {
std::cout << std::string(name) << std::endl;

View file

@ -320,33 +320,33 @@ void GetTempStorageSize(const T* d_keys_in,
bool is_descending,
size_t& temp_storage_bytes) {
if (is_descending) {
cub::DeviceSegmentedRadixSort::SortPairsDescending(nullptr,
temp_storage_bytes,
d_keys_in,
(T*)nullptr,
d_values_in,
(int*)nullptr,
num_items,
num_segments,
d_offsets,
d_offsets + 1,
0,
sizeof(T) * 8,
stream);
CUDA_CALL_THROW(cub::DeviceSegmentedRadixSort::SortPairsDescending(nullptr,
temp_storage_bytes,
d_keys_in,
(T*)nullptr,
d_values_in,
(int*)nullptr,
num_items,
num_segments,
d_offsets,
d_offsets + 1,
0,
sizeof(T) * 8,
stream));
} else {
cub::DeviceSegmentedRadixSort::SortPairs(nullptr,
temp_storage_bytes,
d_keys_in,
(T*)nullptr,
d_values_in,
(int*)nullptr,
num_items,
num_segments,
d_offsets,
d_offsets + 1,
0,
sizeof(T) * 8,
stream);
CUDA_CALL_THROW(cub::DeviceSegmentedRadixSort::SortPairs(nullptr,
temp_storage_bytes,
d_keys_in,
(T*)nullptr,
d_values_in,
(int*)nullptr,
num_items,
num_segments,
d_offsets,
d_offsets + 1,
0,
sizeof(T) * 8,
stream));
}
}
@ -412,33 +412,33 @@ void LaunchSortPairs(void* d_temp_storage,
cudaStream_t stream,
bool is_descending) {
if (is_descending) {
cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage,
temp_storage_bytes,
d_keys_in,
d_keys_out,
d_values_in,
d_values_out,
num_items,
num_segments,
d_offsets,
d_offsets + 1,
0,
sizeof(T) * 8,
stream);
CUDA_CALL_THROW(cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage,
temp_storage_bytes,
d_keys_in,
d_keys_out,
d_values_in,
d_values_out,
num_items,
num_segments,
d_offsets,
d_offsets + 1,
0,
sizeof(T) * 8,
stream));
} else {
cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage,
temp_storage_bytes,
d_keys_in,
d_keys_out,
d_values_in,
d_values_out,
num_items,
num_segments,
d_offsets,
d_offsets + 1,
0,
sizeof(T) * 8,
stream);
CUDA_CALL_THROW(cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage,
temp_storage_bytes,
d_keys_in,
d_keys_out,
d_values_in,
d_values_out,
num_items,
num_segments,
d_offsets,
d_offsets + 1,
0,
sizeof(T) * 8,
stream));
}
}
@ -721,9 +721,9 @@ void TorchMultinomialKernelLauncher(float* d_input,
cudaStream_t stream) {
// Store the props in class variables
int device;
cudaGetDevice(&device);
CUDA_CALL_THROW(cudaGetDevice(&device));
cudaDeviceProp props;
cudaGetDeviceProperties(&props, device);
CUDA_CALL_THROW(cudaGetDeviceProperties(&props, device));
int numSM = props.multiProcessorCount;
int maxThreads = props.maxThreadsPerBlock;

View file

@ -17,14 +17,23 @@
#include "contrib_ops/cpu/transformers/subgraph_gpt.h"
#include "contrib_ops/cuda/transformers/beam_search_topk.h"
#include "contrib_ops/cuda/transformers/greedy_search_top_one.h"
// the includes would be dummy for ROCm, we will ignore them for now
#ifdef ENABLE_NVTX_PROFILE
#include "core/providers/cuda/nvtx_profile.h"
#include "core/providers/cuda/nvtx_profile_context.h"
#endif
#include "sampling_cuda_helper.h"
#ifdef DEBUG_GENERATION
#include <iostream>
#endif
using onnxruntime::cuda::ToCudaType;
using onnxruntime::cuda::TArray;
using onnxruntime::cuda::TopKImpl;
namespace onnxruntime {
namespace concurrency {
class ThreadPool;
@ -203,12 +212,13 @@ void InitBeamState(transformers::IBeamSearchState<T>* beam_state,
// TODO(tianleiwu): we can use another stream to avoid blocking subgraph execution.
cudaStream_t cuda_stream = ort_stream ? static_cast<cudaStream_t>(ort_stream->GetHandle()) : nullptr;
cudaMemsetAsync(beam_state->next_token_logits.data(), 0, beam_state->next_token_logits.size_bytes(), cuda_stream);
cudaMemsetAsync(beam_state->next_token_scores.data(), 0, beam_state->next_token_scores.size_bytes(), cuda_stream);
cudaMemsetAsync(beam_state->next_tokens.data(), 0, beam_state->next_tokens.size_bytes(), cuda_stream);
cudaMemsetAsync(beam_state->next_indices.data(), 0, beam_state->next_indices.size_bytes(), cuda_stream);
cudaMemsetAsync(beam_state->next_scores.data(), 0, beam_state->next_scores.size_bytes(), cuda_stream);
cudaMemsetAsync(beam_state->topk_buffer.data(), 0, beam_state->topk_buffer.size_bytes(), cuda_stream);
CUDA_CALL_THROW(cudaMemsetAsync(beam_state->next_token_logits.data(), 0, beam_state->next_token_logits.size_bytes(), cuda_stream));
CUDA_CALL_THROW(cudaMemsetAsync(beam_state->next_token_scores.data(), 0, beam_state->next_token_scores.size_bytes(), cuda_stream));
CUDA_CALL_THROW(cudaMemsetAsync(beam_state->next_tokens.data(), 0, beam_state->next_tokens.size_bytes(), cuda_stream));
CUDA_CALL_THROW(cudaMemsetAsync(beam_state->next_indices.data(), 0, beam_state->next_indices.size_bytes(), cuda_stream));
CUDA_CALL_THROW(cudaMemsetAsync(beam_state->next_scores.data(), 0, beam_state->next_scores.size_bytes(), cuda_stream));
CUDA_CALL_THROW(cudaMemsetAsync(beam_state->topk_buffer.data(), 0, beam_state->topk_buffer.size_bytes(), cuda_stream));
// Initialize score of first beam of each group with 0 and the rest with -1e9.
cuda::LaunchInitKernel(beam_state->beam_scores.data(), batch_size, num_beams, cuda_stream);
@ -216,8 +226,8 @@ void InitBeamState(transformers::IBeamSearchState<T>* beam_state,
// copy sequence lengths to GPU
// since next_positions is only needed to update feeds after subgraph execution, so it is fine to use Async here.
if (!beam_state->next_positions.empty()) { // next_positions is empty for T5
cudaMemcpyAsync(beam_state->next_positions.data(), sequence_lengths.data(), sequence_lengths.size_bytes(),
cudaMemcpyHostToDevice, cuda_stream);
CUDA_CALL_THROW(cudaMemcpyAsync(beam_state->next_positions.data(), sequence_lengths.data(), sequence_lengths.size_bytes(),
cudaMemcpyHostToDevice, cuda_stream));
}
#ifdef ENABLE_NVTX_PROFILE
@ -234,12 +244,12 @@ void InitGreedyState(transformers::IGreedySearchState<T>* greedy_state,
initStateRange.Begin();
#endif
cudaStream_t cuda_stream = ort_stream ? reinterpret_cast<cudaStream_t>(ort_stream->GetHandle()) : nullptr;
cudaMemsetAsync(greedy_state->next_token_scores.data(), 0, greedy_state->next_token_scores.size_bytes(), cuda_stream);
cudaMemsetAsync(greedy_state->next_positions.data(), 0, greedy_state->next_positions.size_bytes(), cuda_stream);
cudaStream_t cuda_stream = ort_stream ? reinterpret_cast<cudaStream_t>(ort_stream->GetHandle()) : nullptr;
CUDA_CALL_THROW(cudaMemsetAsync(greedy_state->next_token_scores.data(), 0, greedy_state->next_token_scores.size_bytes(), cuda_stream));
CUDA_CALL_THROW(cudaMemsetAsync(greedy_state->next_positions.data(), 0, greedy_state->next_positions.size_bytes(), cuda_stream));
cudaMemcpyAsync(greedy_state->next_positions.data(), sequence_lengths.data(), sequence_lengths.size_bytes(),
cudaMemcpyHostToDevice, cuda_stream);
CUDA_CALL_THROW(cudaMemcpyAsync(greedy_state->next_positions.data(), sequence_lengths.data(), sequence_lengths.size_bytes(),
cudaMemcpyHostToDevice, cuda_stream));
#ifdef ENABLE_NVTX_PROFILE
initStateRange.End();

View file

@ -11,6 +11,9 @@
#include <iostream>
#endif
using onnxruntime::cuda::ToCudaType;
using onnxruntime::cuda::dispatch_blockwise_softmax_forward;
namespace onnxruntime {
namespace contrib {
namespace SamplingCudaHelper {

View file

@ -69,6 +69,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, ParametricSoftplus);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, double, ParametricSoftplus);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, ParametricSoftplus);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, Sampling);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, ScaledTanh);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, double, ScaledTanh);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, ScaledTanh);
@ -166,8 +167,8 @@ Status RegisterRocmContribKernels(KernelRegistry& kernel_registry) {
1, MLFloat16, DecoderAttention)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, int32_t, DynamicSlice)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, int64_t, DynamicSlice)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, EmbedLayerNormalization)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, EmbedLayerNormalization)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, EmbedLayerNormalization)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, EmbedLayerNormalization)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, ImageScaler)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, double, ImageScaler)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, ImageScaler)>,
@ -178,6 +179,7 @@ Status RegisterRocmContribKernels(KernelRegistry& kernel_registry) {
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, ParametricSoftplus)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, double, ParametricSoftplus)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, ParametricSoftplus)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, Sampling)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, ScaledTanh)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, double, ScaledTanh)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, ScaledTanh)>,

View file

@ -1004,7 +1004,7 @@ Status SessionState::CreateSubgraphSessionState() {
for (auto& node : graph_.Nodes()) {
for (auto& entry : node.GetAttributeNameToMutableSubgraphMap()) {
const auto& ep = node.GetExecutionProviderType();
if (!ep.empty() && ep != kCpuExecutionProvider && ep != kCudaExecutionProvider) {
if (!ep.empty() && ep != kCpuExecutionProvider && ep != kCudaExecutionProvider && ep != kRocmExecutionProvider) {
// SessionState is only used when ORT is executing the subgraph. If a non-ORT EP has taken the control flow
// node containing the subgraph it will create whatever state it needs internally.
continue;

View file

@ -131,6 +131,9 @@ SPECIALIZED_BLOCKWISE_SOFTMAX_IMPL(half, half, float)
SPECIALIZED_BLOCKWISE_SOFTMAX_IMPL(double, double, double)
SPECIALIZED_BLOCKWISE_SOFTMAX_IMPL(BFloat16, BFloat16, float)
#ifndef DISABLE_CONTRIB_OPS
SPECIALIZED_BLOCKWISE_SOFTMAX_IMPL(half, float, float) // used by BeamSearch op
#endif
}
}

View file

@ -17,10 +17,11 @@ static void RunTest(const embedlayernorm::OpData& data,
int min_cuda_architecture = use_float16 ? 530 : 0;
bool enable_cuda = HasCudaEnvironment(min_cuda_architecture);
bool enable_rocm = DefaultRocmExecutionProvider().get() != nullptr;
bool enable_dml = DefaultDmlExecutionProvider().get() != nullptr;
bool enable_cpu = !use_float16;
if (enable_cpu || enable_cuda || enable_dml) {
if (enable_cpu || enable_cuda || enable_dml || enable_rocm) {
// Input and output shapes
// Input 0 - input_ids : (batch_size, sequence_size)
// Input 1 - segment_ids : (batch_size, sequence_size)
@ -143,6 +144,10 @@ static void RunTest(const embedlayernorm::OpData& data,
std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
execution_providers.push_back(DefaultCudaExecutionProvider());
tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
} else if (enable_rocm) {
std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
execution_providers.push_back(DefaultRocmExecutionProvider());
tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
} else if (enable_dml) {
std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
execution_providers.push_back(DefaultDmlExecutionProvider());

View file

@ -14,8 +14,8 @@ namespace onnxruntime {
namespace test {
#if defined(__linux__) && !defined(__ANDROID__)
#ifdef USE_CUDA
TEST(SamplingTest, Gpt2Sampling_CUDA) {
#if defined(USE_CUDA) || defined(USE_ROCM)
TEST(SamplingTest, Gpt2Sampling_GPU) {
std::vector<int32_t> input_ids{
0, 0, 0, 0, 0, 52, 195, 731, 321, 301, 734, 620,
41, 554, 74, 622, 206, 222, 75, 223, 221, 198, 224, 572,
@ -25,7 +25,6 @@ TEST(SamplingTest, Gpt2Sampling_CUDA) {
std::vector<int32_t> min_length{1};
std::vector<float> repetition_penalty{1.0f};
std::vector<int32_t> expected_output{
0, 0, 0, 0, 0, 52, 195, 731, 321, 301, 734, 620, 125, 543, 668,
41, 554, 74, 622, 206, 222, 75, 223, 221, 198, 224, 572, 776, 213, 697,
@ -35,9 +34,7 @@ TEST(SamplingTest, Gpt2Sampling_CUDA) {
const int64_t sequence_length = 12;
std::vector<int64_t> input_ids_shape{batch_size, sequence_length};
std::vector<int64_t> parameter_shape{1};
std::vector<int64_t> expected_output_shape{input_ids_shape[0], max_length[0]};
Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault);
@ -62,28 +59,36 @@ TEST(SamplingTest, Gpt2Sampling_CUDA) {
const char* const output_names[] = {"sequences"};
Ort::SessionOptions session_options;
#ifdef USE_CUDA
constexpr int min_cuda_architecture = 530;
if (HasCudaEnvironment(min_cuda_architecture)) {
Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0));
Ort::Session session(*ort_env, ORT_TSTR("testdata/transformers/tiny_gpt2_sampling.onnx"), session_options);
auto ort_outputs = session.Run(Ort::RunOptions{}, input_names, ort_inputs.data(), ort_inputs.size(),
output_names, 1);
ASSERT_EQ(ort_outputs.size(), 1U);
const auto& sequences = ort_outputs[0];
ASSERT_TRUE(sequences.IsTensor());
auto result_ts = sequences.GetTensorTypeAndShapeInfo();
ASSERT_EQ(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, result_ts.GetElementType());
ASSERT_EQ(expected_output_shape, result_ts.GetShape());
const auto* result_vals = sequences.GetTensorData<int32_t>();
auto result_span = gsl::make_span(result_vals, expected_output.size());
ASSERT_TRUE(std::equal(expected_output.cbegin(), expected_output.cend(), result_span.begin(), result_span.end()));
if (!HasCudaEnvironment(min_cuda_architecture)) {
LOGS_DEFAULT(WARNING) << "Hardware NOT support current architecture";
return;
}
Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0));
#else // USE_ROCM
OrtROCMProviderOptions rocm_options;
// TODO - verify the default settings
session_options.AppendExecutionProvider_ROCM(rocm_options);
#endif
Ort::Session session(*ort_env, ORT_TSTR("testdata/transformers/tiny_gpt2_sampling.onnx"), session_options);
auto ort_outputs = session.Run(Ort::RunOptions{}, input_names, ort_inputs.data(), ort_inputs.size(),
output_names, 1);
ASSERT_EQ(ort_outputs.size(), 1U);
const auto& sequences = ort_outputs[0];
ASSERT_TRUE(sequences.IsTensor());
auto result_ts = sequences.GetTensorTypeAndShapeInfo();
ASSERT_EQ(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, result_ts.GetElementType());
ASSERT_EQ(expected_output_shape, result_ts.GetShape());
const auto* result_vals = sequences.GetTensorData<int32_t>();
auto result_span = gsl::make_span(result_vals, expected_output.size());
ASSERT_TRUE(std::equal(expected_output.cbegin(), expected_output.cend(), result_span.begin(), result_span.end()));
}
#endif

View file

@ -59,6 +59,8 @@ def hipify(hipify_perl_path, src_file_path, dst_file_path):
s = s.replace("GPU_WARP_SIZE = 32", "GPU_WARP_SIZE = 64")
s = s.replace("std::exp", "expf")
s = s.replace("std::log", "logf")
s = s.replace("WaitCudaNotificationOnDevice", "WaitRocmNotificationOnDevice")
s = s.replace("hipHostAlloc", "hipHostMalloc")
s = s.replace(
"#include <cub/device/device_radix_sort.cuh>",
"#include <hipcub/hipcub.hpp>\n#include <hipcub/backend/rocprim/device/device_radix_sort.hpp>",
@ -67,6 +69,10 @@ def hipify(hipify_perl_path, src_file_path, dst_file_path):
'#include "cub/device/device_radix_sort.cuh"',
"#include <hipcub/hipcub.hpp>\n#include <hipcub/backend/rocprim/device/device_radix_sort.hpp>",
)
s = s.replace(
"#include <cub/device/device_segmented_radix_sort.cuh>",
"#include <hipcub/backend/rocprim/device/device_segmented_radix_sort.hpp>",
)
s = s.replace(
"#include <cub/device/device_reduce.cuh>", "#include <hipcub/backend/rocprim/device/device_reduce.hpp>"
)