diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index f5d98495ef..f738038bb9 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -583,6 +583,8 @@ if (WIN32)
     string(APPEND CMAKE_CXX_FLAGS " /wd4127")
     # class needs to have dll-interface to be used by clients
     string(APPEND CMAKE_CXX_FLAGS " /wd4251")
+    # C4201 (nonstandard extension used: nameless struct/union) is triggered by thrust headers
+    string(APPEND CMAKE_CXX_FLAGS " /wd4201")
     if (onnxruntime_ENABLE_STATIC_ANALYSIS)
         string(APPEND CMAKE_CXX_FLAGS
             " /analyze:stacksize 131072"
@@ -795,6 +797,10 @@ if (onnxruntime_USE_CUDA)
   if (NOT WIN32)
     set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --expt-relaxed-constexpr --compiler-options -fPIC")
   endif()
+  # Options passed to cudafe
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcudafe \"--diag_suppress=bad_friend_decl\"")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcudafe \"--diag_suppress=unsigned_compare_with_zero\"")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcudafe \"--diag_suppress=expr_has_no_effect\"")
 endif()
 
 if (onnxruntime_USE_TENSORRT)
diff --git a/onnxruntime/core/common/threadpool.cc b/onnxruntime/core/common/threadpool.cc
index c264df14b4..b9a70c5d74 100644
--- a/onnxruntime/core/common/threadpool.cc
+++ b/onnxruntime/core/common/threadpool.cc
@@ -224,7 +224,7 @@ static ParallelForBlock CalculateParallelForBlock(const ptrdiff_t n, const Eigen
   // Calculate parallel efficiency as fraction of total CPU time used for
   // computations:
   double max_efficiency =
-      static_cast<double>(block_count) / (Eigen::divup<int>(block_count, num_threads) * num_threads);
+      static_cast<double>(block_count) / (Eigen::divup<ptrdiff_t>(block_count, num_threads) * num_threads);
 
   // Now try to increase block size up to max_block_size as long as it
   // doesn't decrease parallel efficiency.
@@ -245,7 +245,7 @@ static ParallelForBlock CalculateParallelForBlock(const ptrdiff_t n, const Eigen
     assert(coarser_block_count < prev_block_count);
     prev_block_count = coarser_block_count;
     const double coarser_efficiency =
-        static_cast<double>(coarser_block_count) / (Eigen::divup<int>(coarser_block_count, num_threads) * num_threads);
+        static_cast<double>(coarser_block_count) / (Eigen::divup<ptrdiff_t>(coarser_block_count, num_threads) * num_threads);
     if (coarser_efficiency + 0.01 >= max_efficiency) {
       // Taking it.
       block_size = coarser_block_size;
diff --git a/onnxruntime/core/framework/data_transfer.h b/onnxruntime/core/framework/data_transfer.h
index 798525149b..7278f067db 100644
--- a/onnxruntime/core/framework/data_transfer.h
+++ b/onnxruntime/core/framework/data_transfer.h
@@ -23,6 +23,8 @@ class IDataTransfer {
 class CPUDataTransfer : public IDataTransfer {
  public:
   CPUDataTransfer() = default;
+  // Dampen MSVC warning about not fully overriding CopyTensor
+  using IDataTransfer::CopyTensor;
   bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override;
   common::Status CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const override;
 };
diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
index 03f1fe97a2..f5baa91065 100644
--- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
+++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
@@ -40,7 +40,7 @@ void convTransposeWithDynamicPadsShapeInference(InferenceContext& ctx) {
   }
 
   // first dim is the batch axis and the next is the number of channels.
-  size_t n_input_dims = static_cast<size_t>(input_shape.dim_size() - 2);
+  size_t n_input_dims = static_cast<size_t>(input_shape.dim_size() - size_t{2});
 
   std::vector<int64_t> dilations;
   if (getRepeatedAttribute(ctx, "dilations", dilations)) {
@@ -2046,7 +2046,7 @@ Example 4:
 
           // fill with zeros if needed to reach appropriate size
           if (pads_data.size() != 2 * static_cast<size_t>(input_rank))
-            pads_data.resize(2 * input_rank, 0);
+            pads_data.resize(size_t{2} * input_rank, 0);
 
           const auto& output_shape =
               ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape();
diff --git a/onnxruntime/core/graph/contrib_ops/range_schema_defs.cc b/onnxruntime/core/graph/contrib_ops/range_schema_defs.cc
index 2c749030a7..de16e91946 100644
--- a/onnxruntime/core/graph/contrib_ops/range_schema_defs.cc
+++ b/onnxruntime/core/graph/contrib_ops/range_schema_defs.cc
@@ -69,10 +69,10 @@ template <typename T>
 static int64_t CalcRangeDim(const TensorProto* startShapeInitializer,
                             const TensorProto* limitShapeInitializer,
                             const TensorProto* deltaShapeInitializer) {
-    T start = GetFirstElement<T>(startShapeInitializer);
-    T limit = GetFirstElement<T>(limitShapeInitializer);
-    T delta = GetFirstElement<T>(deltaShapeInitializer);
-    if (delta == T{0}) {
+    auto start = static_cast<double>(GetFirstElement<T>(startShapeInitializer));
+    auto limit = static_cast<double>(GetFirstElement<T>(limitShapeInitializer));
+    auto delta = static_cast<double>(GetFirstElement<T>(deltaShapeInitializer));
+    if (delta == 0) {
         fail_shape_inference("delta in Range operator can not be zero!");
     }
     return static_cast<int64_t>(ceil((1.0 * (limit - start)) / delta));
diff --git a/onnxruntime/core/providers/cuda/gpu_data_transfer.h b/onnxruntime/core/providers/cuda/gpu_data_transfer.h
index 0f3d4687eb..50ba91c441 100644
--- a/onnxruntime/core/providers/cuda/gpu_data_transfer.h
+++ b/onnxruntime/core/providers/cuda/gpu_data_transfer.h
@@ -22,6 +22,8 @@ class GPUDataTransfer : public IDataTransfer {
 
   bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override;
 
+  // Dampen MSVC warning about not fully overriding CopyTensor
+  using IDataTransfer::CopyTensor;
   common::Status CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const override;
 
   cudaStream_t GetStream(int queue_id) const {
diff --git a/onnxruntime/core/providers/cuda/tensor/resize_impl.cu b/onnxruntime/core/providers/cuda/tensor/resize_impl.cu
index 044d297234..d87c0a3c3e 100644
--- a/onnxruntime/core/providers/cuda/tensor/resize_impl.cu
+++ b/onnxruntime/core/providers/cuda/tensor/resize_impl.cu
@@ -461,9 +461,9 @@ void ResizeNearestImpl(
     float cubic_coeff_a,
     CudaFunctionOriginalCoordinate transform_coordinate,
     CudaFunctionNearestPixel calc_nearest_pixel,
-    int64_t* prefix_dim_sum,
+    int64_t* /* prefix_dim_sum */,
     NearestMappingInfo* dims_mapping) {
-  int blocksPerGrid = (int)(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
+  int blocksPerGrid = static_cast<int>(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
 
   bool could2d = rank >= 2 &&
                  transform_coordinate != GetDeviceOriginalCoordinateFunc(ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE) &&
@@ -472,7 +472,7 @@ void ResizeNearestImpl(
     int64_t output_height = output_shape[rank - 2];
     int64_t output_width = output_shape[rank - 1];
     fast_divmod div_output_image = (rank > 2) ? output_div_pitches[rank - 3] : fast_divmod(output_height * output_width);
-    int blocksPerDimsMappingGrid = (int)(ceil((output_height + output_width) / 32.0));
+    int blocksPerDimsMappingGrid = static_cast<int>(ceil((output_height + output_width) / 32.0));
 
     _ResizeNearestMappingKernel2D<T><<<blocksPerDimsMappingGrid, 32, 0>>>(
         input_shape[rank - 2], input_shape[rank - 1],