From c4b49fb7bf340a0d27c7d8e2cb2508cac7f57ccf Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Tue, 6 Feb 2024 12:48:39 -0800 Subject: [PATCH] [CUDA] remove CUBLAS_TENSOR_OP_MATH mode (#19431) This pull request replaces `CUBLAS_TENSOR_OP_MATH` with `CUBLAS_DEFAULT_MATH`. The changes affect several files, including test cases and a Python script for AMD hipify process. ### Motivation and Context CUBLAS_TENSOR_OP_MATH mode is deprecated: https://docs.nvidia.com/cuda/cublas/index.html#cublasmath-t On CUDA versions prior to 11, users are required to set the math mode to CUBLAS_TENSOR_OP_MATH manually to be able to use tensor cores for FP16. On CUDA 11 and CUDA 12, this is no longer required. Since latest ORT only supports CUDA >= 11 so it is safe to remove CUBLAS_TENSOR_OP_MATH from our code base. --- .../cuda/bert/longformer_attention_impl.cu | 1 - onnxruntime/core/providers/cuda/cuda_common.h | 3 +-- .../providers/cuda/test_cases/gemm_options_test.cc | 12 ++++++------ tools/ci_build/amd_hipify.py | 1 - 4 files changed, 7 insertions(+), 10 deletions(-) diff --git a/onnxruntime/contrib_ops/cuda/bert/longformer_attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/longformer_attention_impl.cu index f002394600..c9c66b73b3 100644 --- a/onnxruntime/contrib_ops/cuda/bert/longformer_attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/longformer_attention_impl.cu @@ -1005,7 +1005,6 @@ Status LaunchLongformerAttentionKernel( bool disable_compact_memory, bool use_merged_qkv_weights, bool use_half4) { - CublasMathModeSetter helper(device_prop, cublas, CUBLAS_TENSOR_OP_MATH); size_t softmax_workspace_size = GetLongformerSoftmaxWorkspaceSize(element_size, batch_size, num_heads, diff --git a/onnxruntime/core/providers/cuda/cuda_common.h b/onnxruntime/core/providers/cuda/cuda_common.h index e9941ce743..41c999bace 100644 --- a/onnxruntime/core/providers/cuda/cuda_common.h +++ b/onnxruntime/core/providers/cuda/cuda_common.h @@ -141,8 +141,7 @@ class HalfGemmOptions { } #else cublasMath_t GetMathMode() const { - // CublasMathModeSetter will check whether device has tensor cores later. - return CUBLAS_TENSOR_OP_MATH; + return CUBLAS_DEFAULT_MATH; } cudaDataType GetComputeType() const { diff --git a/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc b/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc index 6cac23f144..4917701e51 100644 --- a/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc @@ -17,7 +17,7 @@ TEST(CudaGemmOptions, TestDefaultOptions) { EXPECT_EQ(gemm_options.GetMathMode(), CUBLAS_DEFAULT_MATH); EXPECT_EQ(gemm_options.GetComputeType(), CUBLAS_COMPUTE_32F); #else - EXPECT_EQ(gemm_options.GetMathMode(), CUBLAS_TENSOR_OP_MATH); + EXPECT_EQ(gemm_options.GetMathMode(), CUBLAS_DEFAULT_MATH); EXPECT_EQ(gemm_options.GetComputeType(), CUDA_R_32F); #endif } @@ -30,7 +30,7 @@ TEST(CudaGemmOptions, TestCompute16F) { EXPECT_EQ(gemm_options.GetMathMode(), CUBLAS_DEFAULT_MATH); EXPECT_EQ(gemm_options.GetComputeType(), CUBLAS_COMPUTE_16F); #else - EXPECT_EQ(gemm_options.GetMathMode(), CUBLAS_TENSOR_OP_MATH); + EXPECT_EQ(gemm_options.GetMathMode(), CUBLAS_DEFAULT_MATH); EXPECT_EQ(gemm_options.GetComputeType(), CUDA_R_16F); #endif } @@ -43,7 +43,7 @@ TEST(CudaGemmOptions, NoReducedPrecision) { EXPECT_EQ(gemm_options.GetMathMode(), CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); EXPECT_EQ(gemm_options.GetComputeType(), CUBLAS_COMPUTE_32F); #else - EXPECT_EQ(gemm_options.GetMathMode(), CUBLAS_TENSOR_OP_MATH); + EXPECT_EQ(gemm_options.GetMathMode(), CUBLAS_DEFAULT_MATH); EXPECT_EQ(gemm_options.GetComputeType(), CUDA_R_32F); #endif } @@ -56,7 +56,7 @@ TEST(CudaGemmOptions, Pedantic) { EXPECT_EQ(gemm_options.GetMathMode(), CUBLAS_PEDANTIC_MATH); EXPECT_EQ(gemm_options.GetComputeType(), CUBLAS_COMPUTE_32F_PEDANTIC); #else - EXPECT_EQ(gemm_options.GetMathMode(), CUBLAS_TENSOR_OP_MATH); + EXPECT_EQ(gemm_options.GetMathMode(), CUBLAS_DEFAULT_MATH); EXPECT_EQ(gemm_options.GetComputeType(), CUDA_R_32F); #endif } @@ -69,7 +69,7 @@ TEST(CudaGemmOptions, Compute16F_Pedantic) { EXPECT_EQ(gemm_options.GetMathMode(), CUBLAS_PEDANTIC_MATH); EXPECT_EQ(gemm_options.GetComputeType(), CUBLAS_COMPUTE_16F_PEDANTIC); #else - EXPECT_EQ(gemm_options.GetMathMode(), CUBLAS_TENSOR_OP_MATH); + EXPECT_EQ(gemm_options.GetMathMode(), CUBLAS_DEFAULT_MATH); EXPECT_EQ(gemm_options.GetComputeType(), CUDA_R_16F); #endif } @@ -82,7 +82,7 @@ TEST(CudaGemmOptions, Compute16F_NoReducedPrecision) { EXPECT_EQ(gemm_options.GetMathMode(), CUBLAS_DEFAULT_MATH); EXPECT_EQ(gemm_options.GetComputeType(), CUBLAS_COMPUTE_16F); #else - EXPECT_EQ(gemm_options.GetMathMode(), CUBLAS_TENSOR_OP_MATH); + EXPECT_EQ(gemm_options.GetMathMode(), CUBLAS_DEFAULT_MATH); EXPECT_EQ(gemm_options.GetComputeType(), CUDA_R_16F); #endif } diff --git a/tools/ci_build/amd_hipify.py b/tools/ci_build/amd_hipify.py index 8ea0481c9b..e286236ba6 100644 --- a/tools/ci_build/amd_hipify.py +++ b/tools/ci_build/amd_hipify.py @@ -117,7 +117,6 @@ def hipify(hipify_perl_path, src_file_path, dst_file_path): s = s.replace("HIPBLAS_R_16F", "rocblas_datatype_f16_r") s = s.replace("HIPBLAS_R_32F", "rocblas_datatype_f32_r") s = s.replace("ROCBLAS_GEMM_DEFAULT_TENSOR_OP", "rocblas_gemm_algo_standard") - s = s.replace("ROCBLAS_TENSOR_OP_MATH", "0 /* CUBLAS_TENSOR_OP_MATH is deprecated */") # compatible layer s = s.replace("rocblas_gemm_strided_batched_ex", "_compat_rocblas_gemm_strided_batched_ex")