mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-07 00:13:17 +00:00
[MigraphX EP] [ROCm EP] Upstream ROCm changes for bugfixes and features (#23249)
Add support to mainline Onnxruntime of changes from the ROCm Team's changes ### Motivation and Context Various bugfixes, and changes added between ROCm 6.2 and 6.3 that haven't been upstreamed yet to mainline --------- Co-authored-by: Yueqing Zhang <yuz75@Pitt.edu> Co-authored-by: Yueqing Zhang <yueqingz@amd.com> Co-authored-by: Jeff Daily <jeff.daily@amd.com> Co-authored-by: Artur Wojcik <artur.wojcik@outlook.com> Co-authored-by: Ted Themistokleous <tedthemistokleous@amd.com> Co-authored-by: Xinya Zhang <Xinya.Zhang@amd.com> Co-authored-by: ikalinic <ilija.kalinic@amd.com> Co-authored-by: sstamenk <sstamenk@amd.com>
This commit is contained in:
parent
e2a488b4f1
commit
23c0282a22
14 changed files with 13322 additions and 12 deletions
|
|
@ -371,6 +371,70 @@ if (onnxruntime_USE_ROCM)
|
|||
set(onnxruntime_HIPIFY_PERL ${HIPIFY_PERL_PATH}/hipify-perl)
|
||||
endif()
|
||||
|
||||
# replicate strategy used by pytorch to get ROCM_VERSION
|
||||
# https://github.com/pytorch/pytorch/blob/1a10751731784942dcbb9c0524c1369a29d45244/cmake/public/LoadHIP.cmake#L45-L109
|
||||
# with modification
|
||||
set(ROCM_INCLUDE_DIRS "${onnxruntime_ROCM_HOME}/include")
|
||||
set(PROJECT_RANDOM_BINARY_DIR "${CMAKE_BINARY_DIR}")
|
||||
set(file "${CMAKE_BINARY_DIR}/detect_rocm_version.cc")
|
||||
|
||||
# Find ROCM version for checks
|
||||
# ROCM 5.0 and later will have header api for version management
|
||||
if(EXISTS ${ROCM_INCLUDE_DIRS}/rocm_version.h)
|
||||
file(WRITE ${file} ""
|
||||
"#include <rocm_version.h>\n"
|
||||
)
|
||||
elseif(EXISTS ${ROCM_INCLUDE_DIRS}/rocm-core/rocm_version.h)
|
||||
file(WRITE ${file} ""
|
||||
"#include <rocm-core/rocm_version.h>\n"
|
||||
)
|
||||
else()
|
||||
message(FATAL_ERROR "********************* rocm_version.h couldnt be found ******************\n")
|
||||
endif()
|
||||
|
||||
file(APPEND ${file} ""
|
||||
"#include <cstdio>\n"
|
||||
|
||||
"#ifndef ROCM_VERSION_PATCH\n"
|
||||
"#define ROCM_VERSION_PATCH 0\n"
|
||||
"#endif\n"
|
||||
"#define STRINGIFYHELPER(x) #x\n"
|
||||
"#define STRINGIFY(x) STRINGIFYHELPER(x)\n"
|
||||
"int main() {\n"
|
||||
" printf(\"%d.%d.%s\", ROCM_VERSION_MAJOR, ROCM_VERSION_MINOR, STRINGIFY(ROCM_VERSION_PATCH));\n"
|
||||
" return 0;\n"
|
||||
"}\n"
|
||||
)
|
||||
|
||||
try_run(run_result compile_result ${PROJECT_RANDOM_BINARY_DIR} ${file}
|
||||
CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${ROCM_INCLUDE_DIRS}"
|
||||
RUN_OUTPUT_VARIABLE rocm_version_from_header
|
||||
COMPILE_OUTPUT_VARIABLE output_var
|
||||
)
|
||||
# We expect the compile to be successful if the include directory exists.
|
||||
if(NOT compile_result)
|
||||
message(FATAL_ERROR "ROCM: Couldn't determine version from header: " ${output_var})
|
||||
endif()
|
||||
message(STATUS "ROCM: Header version is: " ${rocm_version_from_header})
|
||||
set(ROCM_VERSION_DEV_RAW ${rocm_version_from_header})
|
||||
|
||||
string(REGEX MATCH "^([0-9]+)\.([0-9]+)\.([0-9]+).*$" ROCM_VERSION_DEV_MATCH ${ROCM_VERSION_DEV_RAW})
|
||||
|
||||
if (ROCM_VERSION_DEV_MATCH)
|
||||
set(ROCM_VERSION_DEV_MAJOR ${CMAKE_MATCH_1})
|
||||
set(ROCM_VERSION_DEV_MINOR ${CMAKE_MATCH_2})
|
||||
set(ROCM_VERSION_DEV_PATCH ${CMAKE_MATCH_3})
|
||||
set(ROCM_VERSION_DEV "${ROCM_VERSION_DEV_MAJOR}.${ROCM_VERSION_DEV_MINOR}.${ROCM_VERSION_DEV_PATCH}")
|
||||
math(EXPR ROCM_VERSION_DEV_INT "(${ROCM_VERSION_DEV_MAJOR}*10000) + (${ROCM_VERSION_DEV_MINOR}*100) + ${ROCM_VERSION_DEV_PATCH}")
|
||||
else()
|
||||
message(FATAL_ERROR "Cannot determine ROCm version string")
|
||||
endif()
|
||||
message("\n***** ROCm version from rocm_version.h ****\n")
|
||||
message("ROCM_VERSION_DEV: ${ROCM_VERSION_DEV}")
|
||||
message("ROCM_VERSION_DEV_MAJOR: ${ROCM_VERSION_DEV_MAJOR}")
|
||||
message("ROCM_VERSION_DEV_MINOR: ${ROCM_VERSION_DEV_MINOR}")
|
||||
message("ROCM_VERSION_DEV_PATCH: ${ROCM_VERSION_DEV_PATCH}")
|
||||
message("ROCM_VERSION_DEV_INT: ${ROCM_VERSION_DEV_INT}")
|
||||
message("\n***** HIP LANGUAGE CONFIG INFO ****\n")
|
||||
message("CMAKE_HIP_COMPILER: ${CMAKE_HIP_COMPILER}")
|
||||
message("CMAKE_HIP_ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}")
|
||||
|
|
@ -1143,6 +1207,8 @@ function(onnxruntime_set_compile_flags target_name)
|
|||
# because we may mix gcc and hipclang
|
||||
set(ORT_HIP_WARNING_FLAGS ${ORT_WARNING_FLAGS})
|
||||
list(REMOVE_ITEM ORT_HIP_WARNING_FLAGS -Wno-nonnull-compare)
|
||||
# Unsupported by Clang 18 yet.
|
||||
list(REMOVE_ITEM ORT_HIP_WARNING_FLAGS -Wno-dangling-reference)
|
||||
|
||||
# float16.h:90:12: error: ‘tmp’ is used uninitialized
|
||||
list(APPEND ORT_HIP_WARNING_FLAGS -Wno-uninitialized)
|
||||
|
|
|
|||
|
|
@ -67,6 +67,9 @@ common::Status GPUDataTransfer::CopyTensorAsync(const Tensor& src, Tensor& dst,
|
|||
} else if (src_device.Type() == OrtDevice::GPU) {
|
||||
// copying between GPU, this is non-blocking
|
||||
HIP_CALL_THROW(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToDevice, static_cast<hipStream_t>(stream.GetHandle())));
|
||||
} else {
|
||||
// copy from other CPU memory to GPU, this is blocking
|
||||
HIP_CALL_THROW(hipMemcpyWithStream(dst_data, src_data, bytes, hipMemcpyHostToDevice, static_cast<hipStream_t>(stream.GetHandle())));
|
||||
}
|
||||
} else if (src_device.Type() == OrtDevice::GPU) {
|
||||
// If dest are not pinned, the memory copy will be performed synchronously.
|
||||
|
|
|
|||
|
|
@ -821,6 +821,7 @@ GetUnsupportedNodeIndices(const GraphViewer& graph_viewer,
|
|||
"DequantizeLinear",
|
||||
"Div",
|
||||
"Dropout",
|
||||
"Einsum",
|
||||
"Elu",
|
||||
"Equal",
|
||||
"Erf",
|
||||
|
|
|
|||
|
|
@ -53,8 +53,27 @@ __global__ void CopyVectorBFloat16(const onnxruntime::BFloat16* x, int incx, onn
|
|||
|
||||
} // namespace
|
||||
|
||||
dim3 hipblasTransposeHelperDimGrid(int m, int n) {
|
||||
return dim3((n + TRANS_TILE_DIM - 1) / TRANS_TILE_DIM, (m + TRANS_TILE_DIM - 1) / TRANS_TILE_DIM, 1);
|
||||
}
|
||||
|
||||
// hipblasTransposeHelper can only be used if it won't overflow the maxGridSize y dimension size
|
||||
__host__ bool CanUse_hipblasTransposeHelper_MLFloat16(int m, int n) {
|
||||
dim3 dimGrid = hipblasTransposeHelperDimGrid(m, n);
|
||||
|
||||
int deviceId;
|
||||
hipError_t hipError = hipGetDevice(&deviceId);
|
||||
if (hipError != 0) return false;
|
||||
|
||||
hipDeviceProp_t deviceProp;
|
||||
hipError = hipGetDeviceProperties(&deviceProp, deviceId);
|
||||
if (hipError != 0) return false;
|
||||
|
||||
return dimGrid.y < deviceProp.maxGridSize[1];
|
||||
}
|
||||
|
||||
hipblasStatus_t hipblasTransposeHelper(hipStream_t stream, hipblasHandle_t, hipblasOperation_t , hipblasOperation_t , int m, int n, const half*, const half* A, int, const half*, const half*, int, half* C, int) {
|
||||
if (C != A) {
|
||||
if (C != A) {
|
||||
dim3 dimGrid((n + TRANS_TILE_DIM - 1) / TRANS_TILE_DIM, (m + TRANS_TILE_DIM - 1) / TRANS_TILE_DIM, 1);
|
||||
dim3 dimBlock(TRANS_TILE_DIM, BLOCK_ROWS, 1);
|
||||
|
||||
|
|
@ -73,7 +92,7 @@ hipblasStatus_t hipblasCopyHelper(hipStream_t stream, hipblasHandle_t, int n, co
|
|||
}
|
||||
|
||||
hipblasStatus_t hipblasCopyHelper(hipStream_t stream, hipblasHandle_t, int n, const onnxruntime::BFloat16* x, int incx,
|
||||
onnxruntime::BFloat16* y, int incy) {
|
||||
onnxruntime::BFloat16* y, int incy) {
|
||||
dim3 dimGrid((unsigned int)(n + COPY_BLOCK_DIM - 1) / COPY_BLOCK_DIM, 1, 1);
|
||||
dim3 dimBlock(COPY_BLOCK_DIM, 1, 1);
|
||||
CopyVectorBFloat16<<<dimGrid, dimBlock, 0, stream>>>(x, incx, y, incy, n);
|
||||
|
|
|
|||
|
|
@ -955,3 +955,5 @@ inline rocblas_status rocblasGemmStridedBatchedHelper(rocblas_handle handle,
|
|||
C, ldc, strideC,
|
||||
batchCount);
|
||||
}
|
||||
bool CanUse_hipblasTransposeHelper_MLFloat16(int m, int n);
|
||||
hipblasStatus_t hipblasTransposeHelper(hipStream_t stream, rocblas_handle, rocblas_operation, rocblas_operation, int m, int n, const half*, const half* A, int, const half*, const half*, int, half* C, int);
|
||||
|
|
|
|||
|
|
@ -117,6 +117,7 @@ def run_onnxruntime(
|
|||
if (
|
||||
use_gpu
|
||||
and ("CUDAExecutionProvider" not in onnxruntime.get_available_providers())
|
||||
and ("MIGraphXExecutionProvider" not in onnxruntime.get_available_providers())
|
||||
and ("ROCMExecutionProvider" not in onnxruntime.get_available_providers())
|
||||
and ("DmlExecutionProvider" not in onnxruntime.get_available_providers())
|
||||
):
|
||||
|
|
|
|||
|
|
@ -57,7 +57,8 @@ struct ATenOperator {
|
|||
c10::IValue i_value;
|
||||
// Create the torch tensor from this DLPack no matter we need it or not below,
|
||||
// so that the dlpack's deleter will be triggered when torch tensor is out of scope.
|
||||
at::Tensor tensor = at::fromDLPack(dlpack);
|
||||
// work-around upstream pytorch changing fromDLPack to take non-const pointer
|
||||
at::Tensor tensor = at::fromDLPack(const_cast<DLManagedTensor*>(dlpack));
|
||||
switch (elem_kinds[index]) {
|
||||
case c10::TypeKind::TensorType: {
|
||||
i_value = is_optional ? c10::IValue(c10::optional<at::Tensor>(tensor)) : c10::IValue(tensor);
|
||||
|
|
|
|||
|
|
@ -524,6 +524,7 @@ static void RunMultiHeadAttentionTests(AttentionTestData& data,
|
|||
// Test fused cross attention kernel
|
||||
// It requires head_size > 32 and head_size <= 64 for T4 GPU; hidden_size == v_hidden_size.
|
||||
TEST(MultiHeadAttentionTest, CrossAttention_Batch2_HeadSize40) {
|
||||
ROCM_GTEST_SKIP("ROCm MHA skip - missing support for ROCm on Radeon");
|
||||
AttentionTestData data;
|
||||
GetCrossAttentionData_HeadSize40(data);
|
||||
RunMultiHeadAttentionTests(data);
|
||||
|
|
@ -543,6 +544,7 @@ TEST(MultiHeadAttentionTest, CrossAttention_Batch2_HeadSize32_RightSidePadding_M
|
|||
}
|
||||
|
||||
TEST(MultiHeadAttentionTest, CrossAttention_Batch2_HeadSize32_RightSidePadding_Mask2D) {
|
||||
ROCM_GTEST_SKIP("ROCm MHA skip - missing support for ROCm on Radeon");
|
||||
AttentionTestData data;
|
||||
GetCrossAttentionData_Batch2_HeadSize32_RightSidePadding(data, false);
|
||||
RunMultiHeadAttentionTests(data, DISABLE_CPU | DISABLE_WEBGPU);
|
||||
|
|
@ -552,6 +554,7 @@ TEST(MultiHeadAttentionTest, CrossAttention_Batch2_HeadSize32_RightSidePadding_M
|
|||
}
|
||||
|
||||
TEST(MultiHeadAttentionTest, CrossAttention_Batch1_HeadSize32_LeftSidePadding_Mask2D) {
|
||||
ROCM_GTEST_SKIP("ROCm MHA skip - missing support for ROCm on Radeon");
|
||||
AttentionTestData data;
|
||||
GetCrossAttentionData_Batch1_HeadSize32_LeftSidePadding(data);
|
||||
RunMultiHeadAttentionTests(data, DISABLE_CPU | DISABLE_WEBGPU);
|
||||
|
|
@ -561,12 +564,14 @@ TEST(MultiHeadAttentionTest, CrossAttention_Batch1_HeadSize32_LeftSidePadding_Ma
|
|||
}
|
||||
|
||||
TEST(MultiHeadAttentionTest, CrossAttention_Batch2_HeadSize32_NoBias_NoMask_PackedKV) {
|
||||
ROCM_GTEST_SKIP("ROCm MHA skip - missing support for ROCm on Radeon");
|
||||
AttentionTestData data;
|
||||
GetCrossAttentionData_Batch2_HeadSize32_NoBias_NoMask_PackedKV(data);
|
||||
RunMultiHeadAttentionTests(data, DISABLE_WEBGPU);
|
||||
}
|
||||
|
||||
TEST(MultiHeadAttentionTest, SelfAttention_Batch2_HeadSize32_NoBias_NoMask_PackedQKV) {
|
||||
ROCM_GTEST_SKIP("ROCm MHA skip - missing support for ROCm on Radeon");
|
||||
AttentionTestData data;
|
||||
GetSelfAttentionData_Batch2_HeadSize32_NoBias_NoMask_PackedQKV(data);
|
||||
RunMultiHeadAttentionTests(data, DISABLE_WEBGPU);
|
||||
|
|
@ -574,6 +579,7 @@ TEST(MultiHeadAttentionTest, SelfAttention_Batch2_HeadSize32_NoBias_NoMask_Packe
|
|||
|
||||
// This tests qk_head_size != v_head_size
|
||||
TEST(MultiHeadAttentionTest, CrossAttention_Batch2_HeadSize16_8) {
|
||||
ROCM_GTEST_SKIP("ROCm MHA skip - missing support for ROCm on Radeon");
|
||||
AttentionTestData data;
|
||||
GetCrossAttentionData_HeadSize16_8(data);
|
||||
RunMultiHeadAttentionTests(data);
|
||||
|
|
@ -583,6 +589,7 @@ TEST(MultiHeadAttentionTest, CrossAttention_Batch2_HeadSize16_8) {
|
|||
}
|
||||
|
||||
TEST(MultiHeadAttentionTest, CrossAttention_Batch1_HeadSize16) {
|
||||
ROCM_GTEST_SKIP("ROCm MHA skip - missing support for ROCm on Radeon");
|
||||
AttentionTestData data;
|
||||
GetCrossAttentionData_HeadSize16(data);
|
||||
RunMultiHeadAttentionTests(data);
|
||||
|
|
@ -615,7 +622,8 @@ TEST(MultiHeadAttentionTest, SelfAttention_WithPast_WithAttnBias_ForT5) {
|
|||
RunMultiHeadAttentionTests(data, DISABLE_CPU);
|
||||
}
|
||||
|
||||
TEST(MultiHeadAttentionTest, AttentionCutlassAttnBias) {
|
||||
TEST(MultiHeadAttentionTest, AttentionCutlassRelPosBias) {
|
||||
ROCM_GTEST_SKIP("ROCm MHA skip - missing support for ROCm on Radeon");
|
||||
// ROCM_GTEST_SKIP("ROCm does not support cutlass");
|
||||
AttentionTestData data;
|
||||
GetAttentionDataCutlassAttnBias(data);
|
||||
|
|
@ -623,6 +631,7 @@ TEST(MultiHeadAttentionTest, AttentionCutlassAttnBias) {
|
|||
}
|
||||
|
||||
TEST(MultiHeadAttentionTest, CrossAttention_DiffSequenceLengths) {
|
||||
ROCM_GTEST_SKIP("ROCm MHA skip - missing support for ROCm on Radeon");
|
||||
// Whisper decoder cross attention without mask and different sequence lengths for Q and K/V
|
||||
AttentionTestData data;
|
||||
GetCrossAttentionData_DiffSequenceLengths(data);
|
||||
|
|
@ -635,7 +644,8 @@ TEST(MultiHeadAttentionTest, CrossAttention_DiffSequenceLengths) {
|
|||
RunMultiHeadAttentionTests(data, DISABLE_CUDA | DISABLE_WEBGPU);
|
||||
}
|
||||
|
||||
TEST(MultiHeadAttentionTest, SelfAttention_WithPastAndPresent_NoMask_NoAttnBias) {
|
||||
TEST(MultiHeadAttentionTest, SelfAttention_WithPastAndPresent_NoMask_NoRelPosBias) {
|
||||
ROCM_GTEST_SKIP("ROCm MHA skip - missing support for ROCm on Radeon");
|
||||
// Whisper decoder self attention with past_kv and present_kv
|
||||
AttentionTestData data;
|
||||
GetSelfAttentionData_WithPastAndPresent_NoMask_NoAttnBias(data);
|
||||
|
|
|
|||
|
|
@ -514,10 +514,6 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)");
|
|||
} else if (provider_name_ == onnxruntime::kMIGraphXExecutionProvider) {
|
||||
#ifdef USE_MIGRAPHX
|
||||
Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(session_options, 0));
|
||||
OrtROCMProviderOptions rocm_options;
|
||||
rocm_options.miopen_conv_exhaustive_search = performance_test_config.run_config.cudnn_conv_algo;
|
||||
rocm_options.do_copy_in_default_stream = !performance_test_config.run_config.do_cuda_copy_in_separate_stream;
|
||||
session_options.AppendExecutionProvider_ROCM(rocm_options);
|
||||
#else
|
||||
ORT_THROW("MIGraphX is not supported in this build\n");
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -215,14 +215,22 @@ TEST(CudaKernelTest, SoftmaxGrad_LargeTensor_LastAxis_Float16) {
|
|||
std::vector<int64_t> dY_dims{8, 16, 2048};
|
||||
std::vector<int64_t> Y_dims{8, 16, 2048};
|
||||
std::vector<int64_t> dX_dims{8, 16, 2048};
|
||||
#if USE_ROCM
|
||||
TestSoftmaxGrad<MLFloat16>(dY_dims, Y_dims, dX_dims, 2, false, 1.5e-2, 1.5e-2);
|
||||
#else
|
||||
TestSoftmaxGrad<MLFloat16>(dY_dims, Y_dims, dX_dims, 2, false, 1e-3, 1e-3);
|
||||
#endif
|
||||
}
|
||||
|
||||
TEST(CudaKernelTest, SoftmaxGrad_LargeTensor_LastAxis_Float16_NoPowerOfTwo) {
|
||||
std::vector<int64_t> dY_dims{8, 16, 1500};
|
||||
std::vector<int64_t> Y_dims{8, 16, 1500};
|
||||
std::vector<int64_t> dX_dims{8, 16, 1500};
|
||||
#if USE_ROCM
|
||||
TestSoftmaxGrad<MLFloat16>(dY_dims, Y_dims, dX_dims, 2, false, 1.7e-2, 1.7e-2);
|
||||
#else
|
||||
TestSoftmaxGrad<MLFloat16>(dY_dims, Y_dims, dX_dims, 2, false, 1e-3, 1e-3);
|
||||
#endif
|
||||
}
|
||||
|
||||
// large tensor to check cuda DNN softmax backward
|
||||
|
|
@ -238,16 +246,26 @@ TEST(CudaKernelTest, SoftmaxGrad_LargeTensor_AllAxis_Float16) {
|
|||
std::vector<int64_t> dY_dims{8, 16, 512};
|
||||
std::vector<int64_t> Y_dims{8, 16, 512};
|
||||
std::vector<int64_t> dX_dims{8, 16, 512};
|
||||
#if USE_ROCM
|
||||
TestSoftmaxGrad<MLFloat16>(dY_dims, Y_dims, dX_dims, 0, false, 1.5e-2, 1.5e-2);
|
||||
TestSoftmaxGrad<MLFloat16>(dY_dims, Y_dims, dX_dims, 1, false, 1.5e-2, 1.5e-2);
|
||||
#else
|
||||
TestSoftmaxGrad<MLFloat16>(dY_dims, Y_dims, dX_dims, 0, false, 1e-3, 1e-3);
|
||||
TestSoftmaxGrad<MLFloat16>(dY_dims, Y_dims, dX_dims, 1, false, 1e-3, 1e-3);
|
||||
#endif
|
||||
}
|
||||
|
||||
TEST(CudaKernelTest, SoftmaxGrad_LargeTensor_AllAxis_Float16_NoPowerOfTwo) {
|
||||
std::vector<int64_t> dY_dims{8, 16, 1500};
|
||||
std::vector<int64_t> Y_dims{8, 16, 1500};
|
||||
std::vector<int64_t> dX_dims{8, 16, 1500};
|
||||
#if USE_ROCM
|
||||
TestSoftmaxGrad<MLFloat16>(dY_dims, Y_dims, dX_dims, 0, false, 2.5e-2, 2.5e-2);
|
||||
TestSoftmaxGrad<MLFloat16>(dY_dims, Y_dims, dX_dims, 1, false, 2.5e-2, 2.5e-2);
|
||||
#else
|
||||
TestSoftmaxGrad<MLFloat16>(dY_dims, Y_dims, dX_dims, 0, false, 1e-3, 1e-3);
|
||||
TestSoftmaxGrad<MLFloat16>(dY_dims, Y_dims, dX_dims, 1, false, 1e-3, 1e-3);
|
||||
#endif
|
||||
}
|
||||
|
||||
TEST(CudaKernelTest, LogSoftmaxGrad_SmallTensor_LastAxis) {
|
||||
|
|
@ -276,14 +294,23 @@ TEST(CudaKernelTest, LogSoftmaxGrad_LargeTensor_LastAxis_Float16) {
|
|||
std::vector<int64_t> dY_dims{8, 16, 2048};
|
||||
std::vector<int64_t> Y_dims{8, 16, 2048};
|
||||
std::vector<int64_t> dX_dims{8, 16, 2048};
|
||||
#if USE_ROCM
|
||||
TestSoftmaxGrad<MLFloat16>(dY_dims, Y_dims, dX_dims, 2, true, 3.5e-2, 3.5e-2);
|
||||
#else
|
||||
TestSoftmaxGrad<MLFloat16>(dY_dims, Y_dims, dX_dims, 2, true, 1e-3, 1e-3);
|
||||
#endif
|
||||
}
|
||||
|
||||
TEST(CudaKernelTest, LogSoftmaxGrad_LargeTensor_LastAxis_Float16_NoPowerOfTwo) {
|
||||
std::vector<int64_t> dY_dims{8, 16, 1500};
|
||||
std::vector<int64_t> Y_dims{8, 16, 1500};
|
||||
std::vector<int64_t> dX_dims{8, 16, 1500};
|
||||
#if USE_ROCM
|
||||
// FIXME: Excessive numerical errors
|
||||
TestSoftmaxGrad<MLFloat16>(dY_dims, Y_dims, dX_dims, 2, true, 1.0, 5e-2);
|
||||
#else
|
||||
TestSoftmaxGrad<MLFloat16>(dY_dims, Y_dims, dX_dims, 2, true, 1e-3, 1e-3);
|
||||
#endif
|
||||
}
|
||||
|
||||
TEST(CudaKernelTest, LogSoftmaxGrad_LargeTensor_AllAxis) {
|
||||
|
|
@ -298,16 +325,26 @@ TEST(CudaKernelTest, LogSoftmaxGrad_LargeTensor_AllAxis_Float16) {
|
|||
std::vector<int64_t> dY_dims{8, 16, 512};
|
||||
std::vector<int64_t> Y_dims{8, 16, 512};
|
||||
std::vector<int64_t> dX_dims{8, 16, 512};
|
||||
#if USE_ROCM
|
||||
TestSoftmaxGrad<MLFloat16>(dY_dims, Y_dims, dX_dims, 0, true, 1.5e-2, 1.5e-2);
|
||||
TestSoftmaxGrad<MLFloat16>(dY_dims, Y_dims, dX_dims, 1, true, 1.5e-2, 1.5e-2);
|
||||
#else
|
||||
TestSoftmaxGrad<MLFloat16>(dY_dims, Y_dims, dX_dims, 0, true, 1e-3, 1e-3);
|
||||
TestSoftmaxGrad<MLFloat16>(dY_dims, Y_dims, dX_dims, 1, true, 1e-3, 1e-3);
|
||||
#endif
|
||||
}
|
||||
|
||||
TEST(CudaKernelTest, LogSoftmaxGrad_LargeTensor_AllAxis_Float16_NoPowerOfTwo) {
|
||||
std::vector<int64_t> dY_dims{8, 16, 1500};
|
||||
std::vector<int64_t> Y_dims{8, 16, 1500};
|
||||
std::vector<int64_t> dX_dims{8, 16, 1500};
|
||||
#if USE_ROCM
|
||||
TestSoftmaxGrad<MLFloat16>(dY_dims, Y_dims, dX_dims, 0, true, 4.5e-2, 4.5e-2);
|
||||
TestSoftmaxGrad<MLFloat16>(dY_dims, Y_dims, dX_dims, 1, true, 4.5e-2, 4.5e-2);
|
||||
#else
|
||||
TestSoftmaxGrad<MLFloat16>(dY_dims, Y_dims, dX_dims, 0, true, 1e-3, 1e-3);
|
||||
TestSoftmaxGrad<MLFloat16>(dY_dims, Y_dims, dX_dims, 1, true, 1e-3, 1e-3);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void TestSoftmaxGrad_13(const std::vector<int64_t>& dY_dims,
|
||||
|
|
|
|||
8
setup.py
8
setup.py
|
|
@ -66,6 +66,8 @@ if wheel_name_suffix == "gpu":
|
|||
elif parse_arg_remove_boolean(sys.argv, "--use_rocm"):
|
||||
is_rocm = True
|
||||
rocm_version = parse_arg_remove_string(sys.argv, "--rocm_version=")
|
||||
if parse_arg_remove_boolean(sys.argv, "--use_migraphx"):
|
||||
is_migraphx = True
|
||||
elif parse_arg_remove_boolean(sys.argv, "--use_migraphx"):
|
||||
is_migraphx = True
|
||||
elif parse_arg_remove_boolean(sys.argv, "--use_openvino"):
|
||||
|
|
@ -90,8 +92,10 @@ elif parse_arg_remove_boolean(sys.argv, "--use_qnn"):
|
|||
is_qnn = True
|
||||
package_name = "onnxruntime-qnn"
|
||||
|
||||
if is_rocm or is_migraphx:
|
||||
package_name = "onnxruntime-rocm"
|
||||
if is_rocm:
|
||||
package_name = "onnxruntime-rocm" if not nightly_build else "ort-rocm-nightly"
|
||||
elif is_migraphx:
|
||||
package_name = "onnxruntime-migraphx" if not nightly_build else "ort-migraphx-nightly"
|
||||
|
||||
# PEP 513 defined manylinux1_x86_64 and manylinux1_i686
|
||||
# PEP 571 defined manylinux2010_x86_64 and manylinux2010_i686
|
||||
|
|
|
|||
|
|
@ -187,4 +187,4 @@ if __name__ == "__main__":
|
|||
parser.add_argument("src", help="src")
|
||||
args = parser.parse_args()
|
||||
|
||||
hipify(args.hipify_perl, args.src, args.output)
|
||||
hipify(os.path.join(os.path.dirname(__file__), "hipify-perl"), args.src, args.output)
|
||||
|
|
|
|||
|
|
@ -2320,6 +2320,8 @@ def build_python_wheel(
|
|||
args.append("--use_rocm")
|
||||
if rocm_version:
|
||||
args.append(f"--rocm_version={rocm_version}")
|
||||
if use_migraphx:
|
||||
args.append("--use_migraphx")
|
||||
elif use_migraphx:
|
||||
args.append("--use_migraphx")
|
||||
elif use_openvino:
|
||||
|
|
|
|||
13168
tools/ci_build/hipify-perl
Executable file
13168
tools/ci_build/hipify-perl
Executable file
File diff suppressed because it is too large
Load diff
Loading…
Reference in a new issue