diff --git a/onnxruntime/core/providers/cuda/fpgeneric.cu b/onnxruntime/core/providers/cuda/fpgeneric.cu index 5f8314d325..5c0df6332d 100644 --- a/onnxruntime/core/providers/cuda/fpgeneric.cu +++ b/onnxruntime/core/providers/cuda/fpgeneric.cu @@ -22,14 +22,19 @@ // kernel(s) for half functions with no library support namespace { +// TODO - refactor the function with similar logic in Transpose3DKernel using 16x16 Tile __global__ void transposeNoOverlap(half* odata, const half* idata, const int m, const int n) { __shared__ half tile[TRANS_TILE_DIM][TRANS_TILE_DIM + 1]; int x = blockIdx.x * TRANS_TILE_DIM + threadIdx.x; int y = blockIdx.y * TRANS_TILE_DIM + threadIdx.y; - for (int j = 0; j < TRANS_TILE_DIM; j += BLOCK_ROWS) - tile[threadIdx.y + j][threadIdx.x] = idata[(y + j) * m + x]; + if (x < m) { + for (int j = 0; j < TRANS_TILE_DIM; j += BLOCK_ROWS) { + if (j >= (n - y)) continue; + tile[threadIdx.y + j][threadIdx.x] = idata[(y + j) * m + x]; + } + } __syncthreads();