diff --git a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc
index ce458b60b4..0cc021cdd2 100644
--- a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc
+++ b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc
@@ -292,7 +292,7 @@ Status CudnnRnnBase<T>::ComputeInternal(OpKernelContext* ctx) const {
       zero_seq_index_cache_size = zero_seq_count * num_directions_;
       zero_seq_index_cache.resize(zero_seq_index_cache_size);
       for (int64_t i = 0; i < zero_seq_count; ++i) {
-        zero_seq_index_cache[zero_seq_count + i] = static_cast<int32_t>(batch_size + zero_seq_index_cache[i]);
+        zero_seq_index_cache[static_cast<size_t>(zero_seq_count) + i] = static_cast<int32_t>(batch_size + zero_seq_index_cache[i]);
       }
     }
 
diff --git a/onnxruntime/core/providers/cuda/tensor/where.cc b/onnxruntime/core/providers/cuda/tensor/where.cc
index 76baca5c4d..0b0c1f4a1d 100644
--- a/onnxruntime/core/providers/cuda/tensor/where.cc
+++ b/onnxruntime/core/providers/cuda/tensor/where.cc
@@ -115,8 +115,8 @@ struct TernaryElementwisePreparation {
         auto offset = out_rank - rank;
         for (auto i = offset; i < out_rank; ++i) {
           // the stride for broadcast dimension is kept as 0
-          if (shape.GetDims()[i - offset] != 1) {
-            padded_strides[i] = pitches[i - offset];
+          if (shape.GetDims()[gsl::narrow_cast<size_t>(i) - offset] != 1) {
+            padded_strides[i] = pitches[gsl::narrow_cast<size_t>(i) - offset];
           }
         }
       }
diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
index 9ae26d2271..e4e26039e6 100644
--- a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
+++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
@@ -160,14 +160,18 @@ AllocatorPtr GetCudaAllocator(OrtDevice::DeviceId id) {
   // Current approach is not thread-safe, but there are some bigger infra pieces to put together in order to make
   // multi-threaded CUDA allocation work we need to maintain a per-thread CUDA allocator
 
-  static auto* id_to_allocator_map = new std::unordered_map<OrtDevice::DeviceId, AllocatorPtr>();
+  // This map is intentionally leaked so that the CUDA allocator instances it holds are never destroyed
+  // after the CUDA provider library has been unloaded. make_unique().release() replaces a raw `new` only to appease static analysis.
+  static auto* id_to_allocator_map = std::make_unique<std::unordered_map<OrtDevice::DeviceId, AllocatorPtr>>().release();
 
-  if (id_to_allocator_map->find(id) == id_to_allocator_map->end()) {
+  auto hit = id_to_allocator_map->find(id);
+  if (hit == id_to_allocator_map->end()) {
     // TODO: Expose knobs so that users can set fields associated with OrtArenaCfg so that we can pass it to the following method
-    id_to_allocator_map->insert({id, GetProviderInfo_CUDA().CreateCudaAllocator(id, gpu_mem_limit, arena_extend_strategy, external_allocator_info, nullptr)});
+    auto cuda_allocator = GetProviderInfo_CUDA().CreateCudaAllocator(id, gpu_mem_limit, arena_extend_strategy, external_allocator_info, nullptr);
+    hit = id_to_allocator_map->emplace(id, std::move(cuda_allocator)).first;
   }
 
-  return (*id_to_allocator_map)[id];
+  return hit->second;
 }
 
 std::unique_ptr<IDataTransfer> GetGPUDataTransfer() {