diff --git a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc index ce458b60b4..0cc021cdd2 100644 --- a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc +++ b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc @@ -292,7 +292,7 @@ Status CudnnRnnBase::ComputeInternal(OpKernelContext* ctx) const { zero_seq_index_cache_size = zero_seq_count * num_directions_; zero_seq_index_cache.resize(zero_seq_index_cache_size); for (int64_t i = 0; i < zero_seq_count; ++i) { - zero_seq_index_cache[zero_seq_count + i] = static_cast(batch_size + zero_seq_index_cache[i]); + zero_seq_index_cache[static_cast(zero_seq_count) + i] = static_cast(batch_size + zero_seq_index_cache[i]); } } diff --git a/onnxruntime/core/providers/cuda/tensor/where.cc b/onnxruntime/core/providers/cuda/tensor/where.cc index 76baca5c4d..0b0c1f4a1d 100644 --- a/onnxruntime/core/providers/cuda/tensor/where.cc +++ b/onnxruntime/core/providers/cuda/tensor/where.cc @@ -115,8 +115,8 @@ struct TernaryElementwisePreparation { auto offset = out_rank - rank; for (auto i = offset; i < out_rank; ++i) { // the stride for broadcast dimension is kept as 0 - if (shape.GetDims()[i - offset] != 1) { - padded_strides[i] = pitches[i - offset]; + if (shape.GetDims()[gsl::narrow_cast(i) - offset] != 1) { + padded_strides[i] = pitches[gsl::narrow_cast(i) - offset]; } } } diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc index 9ae26d2271..e4e26039e6 100644 --- a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc +++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc @@ -160,14 +160,18 @@ AllocatorPtr GetCudaAllocator(OrtDevice::DeviceId id) { // Current approach is not thread-safe, but there are some bigger infra pieces to put together in order to make // multi-threaded CUDA allocation work we need to maintain a per-thread CUDA allocator - static auto* id_to_allocator_map = new std::unordered_map(); + // We are leaking this map so we do not accidentally destroy CUDA Allocator instance + // after we unloaded CUDA provider library. Appeasing static analysis warning and using make_unique. + static auto* id_to_allocator_map = std::make_unique>().release(); - if (id_to_allocator_map->find(id) == id_to_allocator_map->end()) { + auto hit = id_to_allocator_map->find(id); + if (hit == id_to_allocator_map->end()) { // TODO: Expose knobs so that users can set fields associated with OrtArenaCfg so that we can pass it to the following method - id_to_allocator_map->insert({id, GetProviderInfo_CUDA().CreateCudaAllocator(id, gpu_mem_limit, arena_extend_strategy, external_allocator_info, nullptr)}); + auto cuda_allocator = GetProviderInfo_CUDA().CreateCudaAllocator(id, gpu_mem_limit, arena_extend_strategy, external_allocator_info, nullptr); + hit = id_to_allocator_map->emplace(id, std::move(cuda_allocator)).first; } - return (*id_to_allocator_map)[id]; + return hit->second; } std::unique_ptr GetGPUDataTransfer() {