Initialize device_id in cuda_call & rocm_call (#20933)

### Description
<!-- Describe your changes. -->

Initialize the device ID variables (`currentCudaDevice` in `CudaCall` and `currentHipDevice` in `RocmCall`) with `-1`.

### Motivation and Context

From PyTorch code:
bb2de3b101/c10/cuda/CUDAFunctions.cpp (L217-L324)

If `cudaGetDevice` or `hipGetDevice` fails, the uninitialized `int` is left
holding an arbitrary value that can change from run to run:

```text
[with ERRTYPE = hipError_t; bool THRW = true; std::conditional_t<THRW, void, common::Status> = void] HIP failure 101: invalid device ordinal ; GPU=32741 ; hostname=e6724be2a31a ; file=/onnxruntime_src/onnxruntime/core/providers/rocm/rocm_common.h ; line=66 ; expr=hipGetDeviceProperties(&deviceProp, 0); 
```

Notice the `GPU` value above. Initializing to `-1` clearly indicates such a
failure and avoids confusion.
This commit is contained in:
Chester Liu 2024-06-06 11:19:09 +08:00 committed by GitHub
parent b5eb9e8a8a
commit eb2ec66716
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 2 additions and 2 deletions

View file

@ -103,7 +103,7 @@ std::conditional_t<THRW, void, Status> CudaCall(
if (gethostname(hostname, HOST_NAME_MAX) != 0)
strcpy(hostname, "?");
#endif
int currentCudaDevice;
int currentCudaDevice = -1;
cudaGetDevice(&currentCudaDevice);
cudaGetLastError(); // clear last CUDA error
static char str[1024];

View file

@ -104,7 +104,7 @@ std::conditional_t<THRW, void, Status> RocmCall(
if (gethostname(hostname, HOST_NAME_MAX) != 0)
strcpy(hostname, "?");
#endif
int currentHipDevice;
int currentHipDevice = -1;
ORT_IGNORE_RETURN_VALUE(hipGetDevice(&currentHipDevice)); // void to silence nodiscard
ORT_IGNORE_RETURN_VALUE(hipGetLastError()); // clear last ROCM error; void to silence nodiscard
static char str[1024];