mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-18 21:21:17 +00:00
Initialize device_id in cuda_call & rocm_call (#20933)
### Description
<!-- Describe your changes. -->
Initialize `device_id` with `-1` in `cuda_call` and `rocm_call`.
### Motivation and Context
From PyTorch code:
bb2de3b101/c10/cuda/CUDAFunctions.cpp (L217-L324)
If `cudaGetDevice` or `hipGetDevice` fails, the uninitialized `int`
holds an indeterminate value that changes from run to run:
```text
[with ERRTYPE = hipError_t; bool THRW = true; std::conditional_t<THRW, void, common::Status> = void] HIP failure 101: invalid device ordinal ; GPU=32741 ; hostname=e6724be2a31a ; file=/onnxruntime_src/onnxruntime/core/providers/rocm/rocm_common.h ; line=66 ; expr=hipGetDeviceProperties(&deviceProp, 0);
```
Notice the `GPU` value above. Using `-1` clearly indicates such a
failure and avoids confusion.
This commit is contained in:
parent
b5eb9e8a8a
commit
eb2ec66716
2 changed files with 2 additions and 2 deletions
|
|
@@ -103,7 +103,7 @@ std::conditional_t<THRW, void, Status> CudaCall(
|
|||
if (gethostname(hostname, HOST_NAME_MAX) != 0)
|
||||
strcpy(hostname, "?");
|
||||
#endif
|
||||
int currentCudaDevice;
|
||||
int currentCudaDevice = -1;
|
||||
cudaGetDevice(&currentCudaDevice);
|
||||
cudaGetLastError(); // clear last CUDA error
|
||||
static char str[1024];
|
||||
|
|
|
|||
|
|
@@ -104,7 +104,7 @@ std::conditional_t<THRW, void, Status> RocmCall(
|
|||
if (gethostname(hostname, HOST_NAME_MAX) != 0)
|
||||
strcpy(hostname, "?");
|
||||
#endif
|
||||
int currentHipDevice;
|
||||
int currentHipDevice = -1;
|
||||
ORT_IGNORE_RETURN_VALUE(hipGetDevice(&currentHipDevice)); // void to silence nodiscard
|
||||
ORT_IGNORE_RETURN_VALUE(hipGetLastError()); // clear last ROCM error; void to silence nodiscard
|
||||
static char str[1024];
|
||||
|
|
|
|||
Loading…
Reference in a new issue