diff --git a/BUILD.bazel b/BUILD.bazel
index 88ba8d66c6a..843b27a8f83 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -414,6 +414,7 @@ cc_library(
 torch_cuda_half_options = [
     "-DCUDA_HAS_FP16=1",
     "-D__CUDA_NO_HALF_OPERATORS__",
+    "-D__CUDA_NO_HALF_CONVERSIONS__",
     "-D__CUDA_NO_BFLOAT16_CONVERSIONS__",
     "-D__CUDA_NO_HALF2_OPERATORS__",
 ]
diff --git a/aten/src/ATen/native/cuda/KernelUtils.cuh b/aten/src/ATen/native/cuda/KernelUtils.cuh
index ec7292f03d0..e1b9f380723 100644
--- a/aten/src/ATen/native/cuda/KernelUtils.cuh
+++ b/aten/src/ATen/native/cuda/KernelUtils.cuh
@@ -49,14 +49,14 @@ __device__ __forceinline__ void fastSpecializedAtomicAdd(

   if (low_byte && index < (numel - 1)) {
     __half2 value2;
-    value2.x = static_cast<__half>(value);
+    value2.x = value;
     value2.y = __int2half_rz(0);
     atomicAdd(reinterpret_cast<__half2*>(target_addr), value2);

   } else if (!low_byte && index > 0) {
     __half2 value2;
     value2.x = __int2half_rz(0);
-    value2.y = static_cast<__half>(value);
+    value2.y = value;
     atomicAdd(reinterpret_cast<__half2*>(target_addr - 1), value2);

   } else {
diff --git a/aten/src/ATen/test/cuda_half_test.cu b/aten/src/ATen/test/cuda_half_test.cu
index d6d7e8a93f5..aa1644c94b7 100644
--- a/aten/src/ATen/test/cuda_half_test.cu
+++ b/aten/src/ATen/test/cuda_half_test.cu
@@ -21,7 +21,7 @@ __device__ void test(){

   __half a = __float2half(3.0f);
   __half b = __float2half(2.0f);
-  __half c = Half(a) - Half(b);
+  __half c = a - Half(b);
   assert(static_cast<Half>(c) == Half(1.0));

   // asserting if the functions used on
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 8c462031550..0012d26acaa 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -1653,6 +1653,7 @@ if(NOT INTERN_BUILD_MOBILE)
   message(STATUS "Found CUDA with FP16 support, compiling with torch.cuda.HalfTensor")
   string(APPEND CMAKE_CUDA_FLAGS " -DCUDA_HAS_FP16=1"
                                  " -D__CUDA_NO_HALF_OPERATORS__"
+                                 " -D__CUDA_NO_HALF_CONVERSIONS__"
                                  " -D__CUDA_NO_HALF2_OPERATORS__"
                                  " -D__CUDA_NO_BFLOAT16_CONVERSIONS__")

diff --git a/third_party/cutlass b/third_party/cutlass
index 66d9cddc832..b72cbf957df 160000
--- a/third_party/cutlass
+++ b/third_party/cutlass
@@ -1 +1 @@
-Subproject commit 66d9cddc832c1cdc2b30a8755274f7f74640cfe6
+Subproject commit b72cbf957df8cf84a6d0ff91c190ad51a9c1d24a
diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py
index 54e7fa98f12..11b233f2712 100644
--- a/torch/utils/cpp_extension.py
+++ b/torch/utils/cpp_extension.py
@@ -225,6 +225,7 @@ MSVC_IGNORE_CUDAFE_WARNINGS = [

 COMMON_NVCC_FLAGS = [
     '-D__CUDA_NO_HALF_OPERATORS__',
+    '-D__CUDA_NO_HALF_CONVERSIONS__',
     '-D__CUDA_NO_BFLOAT16_CONVERSIONS__',
     '-D__CUDA_NO_HALF2_OPERATORS__',
     '--expt-relaxed-constexpr'