onnxruntime/cmake/patches/cutlass/cutlass_3.5.0.patch

diff --git a/include/cutlass/functional.h b/include/cutlass/functional.h
index 964d2ff3..b366bc14 100644
--- a/include/cutlass/functional.h
+++ b/include/cutlass/functional.h
@@ -39,6 +39,7 @@
 #include "cutlass/numeric_types.h"

 #include <cuda_runtime.h>
+#include <cuda_fp16.h>

 #if defined(CUTLASS_ARCH_WMMA_ENABLED)
 #include <mma.h>
@@ -230,8 +231,12 @@ struct inverse_square_root<half_t> {
   CUTLASS_HOST_DEVICE
   half_t operator()(half_t const &lhs) const {
 #if defined(__CUDA_ARCH__)
+#if (__CUDA_ARCH__ >= 530)
     auto result = hrsqrt(reinterpret_cast<__half const &>(lhs));
     return reinterpret_cast<half_t const &>(result);
+#else
+    return half_t::convert((rsqrtf(half_t::convert(lhs))));
+#endif
 #else
     return half_t(1.f / std::sqrt(half_t::convert(lhs)));
 #endif