From eb2ec667166a4b4a202cd30ebdb5e147b2013350 Mon Sep 17 00:00:00 2001
From: Chester Liu <4710575+skyline75489@users.noreply.github.com>
Date: Thu, 6 Jun 2024 11:19:09 +0800
Subject: [PATCH] Initialize device_id in cuda_call & rocm_call (#20933)

### Description

Initialize `device_id` with `-1` in `cuda_call` and `rocm_call`.

### Motivation and Context

From PyTorch code:

https://github.com/pytorch/pytorch/blob/bb2de3b10120f91afce8da6233094076713f673d/c10/cuda/CUDAFunctions.cpp#L217-L324

If `cudaGetDevice` or `hipGetDevice` fails, the uninitialized `int` is left holding an indeterminate value that changes from run to run:

```text
[with ERRTYPE = hipError_t; bool THRW = true; std::conditional_t<THRW, void, onnxruntime::common::Status> = void] HIP failure 101: invalid device ordinal ; GPU=32741 ; hostname=e6724be2a31a ; file=/onnxruntime_src/onnxruntime/core/providers/rocm/rocm_common.h ; line=66 ; expr=hipGetDeviceProperties(&deviceProp, 0);
```

Notice the `GPU` value above. Using `-1` clearly indicates such a failure and avoids confusion.
---
 onnxruntime/core/providers/cuda/cuda_call.cc | 2 +-
 onnxruntime/core/providers/rocm/rocm_call.cc | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/core/providers/cuda/cuda_call.cc b/onnxruntime/core/providers/cuda/cuda_call.cc
index f60684795a..c73b23f376 100644
--- a/onnxruntime/core/providers/cuda/cuda_call.cc
+++ b/onnxruntime/core/providers/cuda/cuda_call.cc
@@ -103,7 +103,7 @@ std::conditional_t<THRW, void, Status> CudaCall(
       if (gethostname(hostname, HOST_NAME_MAX) != 0)
         strcpy(hostname, "?");
 #endif
-      int currentCudaDevice;
+      int currentCudaDevice = -1;
       cudaGetDevice(&currentCudaDevice);
       cudaGetLastError();  // clear last CUDA error
       static char str[1024];
diff --git a/onnxruntime/core/providers/rocm/rocm_call.cc b/onnxruntime/core/providers/rocm/rocm_call.cc
index 484e59f4de..7974053c32 100644
--- a/onnxruntime/core/providers/rocm/rocm_call.cc
+++ b/onnxruntime/core/providers/rocm/rocm_call.cc
@@ -104,7 +104,7 @@ std::conditional_t<THRW, void, Status> RocmCall(
       if (gethostname(hostname, HOST_NAME_MAX) != 0)
         strcpy(hostname, "?");
 #endif
-      int currentHipDevice;
+      int currentHipDevice = -1;
       ORT_IGNORE_RETURN_VALUE(hipGetDevice(&currentHipDevice));  // void to silence nodiscard
       ORT_IGNORE_RETURN_VALUE(hipGetLastError());  // clear last ROCM error; void to silence nodiscard
       static char str[1024];