diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index f5d98495ef..f738038bb9 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -583,6 +583,8 @@ if (WIN32) string(APPEND CMAKE_CXX_FLAGS " /wd4127") # class needs to have dll-interface to be used by clients string(APPEND CMAKE_CXX_FLAGS " /wd4251") + # issued by thrust nonstandard extension used: nameless struct/union + string(APPEND CMAKE_CXX_FLAGS " /wd4201") if (onnxruntime_ENABLE_STATIC_ANALYSIS) string(APPEND CMAKE_CXX_FLAGS " /analyze:stacksize 131072" @@ -795,6 +797,10 @@ if (onnxruntime_USE_CUDA) if (NOT WIN32) set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --expt-relaxed-constexpr --compiler-options -fPIC") endif() + # Options passed to cudafe + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcudafe \"--diag_suppress=bad_friend_decl\"") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcudafe \"--diag_suppress=unsigned_compare_with_zero\"") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcudafe \"--diag_suppress=expr_has_no_effect\"") endif() if (onnxruntime_USE_TENSORRT) diff --git a/onnxruntime/core/common/threadpool.cc b/onnxruntime/core/common/threadpool.cc index c264df14b4..b9a70c5d74 100644 --- a/onnxruntime/core/common/threadpool.cc +++ b/onnxruntime/core/common/threadpool.cc @@ -224,7 +224,7 @@ static ParallelForBlock CalculateParallelForBlock(const ptrdiff_t n, const Eigen // Calculate parallel efficiency as fraction of total CPU time used for // computations: double max_efficiency = - static_cast(block_count) / (Eigen::divup(block_count, num_threads) * num_threads); + static_cast(block_count) / (Eigen::divup(block_count, num_threads) * num_threads); // Now try to increase block size up to max_block_size as long as it // doesn't decrease parallel efficiency. 
@@ -245,7 +245,7 @@ static ParallelForBlock CalculateParallelForBlock(const ptrdiff_t n, const Eigen assert(coarser_block_count < prev_block_count); prev_block_count = coarser_block_count; const double coarser_efficiency = - static_cast(coarser_block_count) / (Eigen::divup(coarser_block_count, num_threads) * num_threads); + static_cast(coarser_block_count) / (Eigen::divup(coarser_block_count, num_threads) * num_threads); if (coarser_efficiency + 0.01 >= max_efficiency) { // Taking it. block_size = coarser_block_size; diff --git a/onnxruntime/core/framework/data_transfer.h b/onnxruntime/core/framework/data_transfer.h index 798525149b..7278f067db 100644 --- a/onnxruntime/core/framework/data_transfer.h +++ b/onnxruntime/core/framework/data_transfer.h @@ -23,6 +23,8 @@ class IDataTransfer { class CPUDataTransfer : public IDataTransfer { public: CPUDataTransfer() = default; + // Dampen MSVC warning about not fully overriding CopyTensor + using IDataTransfer::CopyTensor; bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override; common::Status CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const override; }; diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc index 03f1fe97a2..f5baa91065 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc @@ -40,7 +40,7 @@ void convTransposeWithDynamicPadsShapeInference(InferenceContext& ctx) { } // first dim is the batch axis and the next is the number of channels. 
- size_t n_input_dims = static_cast(input_shape.dim_size() - 2); + size_t n_input_dims = static_cast(input_shape.dim_size() - size_t{2}); std::vector dilations; if (getRepeatedAttribute(ctx, "dilations", dilations)) { @@ -2046,7 +2046,7 @@ Example 4: // fill with zeros if needed to reach appropriate size if (pads_data.size() != 2 * static_cast(input_rank)) - pads_data.resize(2 * input_rank, 0); + pads_data.resize(size_t{2} * input_rank, 0); const auto& output_shape = ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape(); diff --git a/onnxruntime/core/graph/contrib_ops/range_schema_defs.cc b/onnxruntime/core/graph/contrib_ops/range_schema_defs.cc index 2c749030a7..de16e91946 100644 --- a/onnxruntime/core/graph/contrib_ops/range_schema_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/range_schema_defs.cc @@ -69,10 +69,10 @@ template static int64_t CalcRangeDim(const TensorProto* startShapeInitializer, const TensorProto* limitShapeInitializer, const TensorProto* deltaShapeInitializer) { - T start = GetFirstElement(startShapeInitializer); - T limit = GetFirstElement(limitShapeInitializer); - T delta = GetFirstElement(deltaShapeInitializer); - if (delta == T{0}) { + auto start = static_cast(GetFirstElement(startShapeInitializer)); + auto limit = static_cast(GetFirstElement(limitShapeInitializer)); + auto delta = static_cast(GetFirstElement(deltaShapeInitializer)); + if (delta == 0) { fail_shape_inference("delta in Range operator can not be zero!"); } return static_cast(ceil((1.0 * (limit - start)) / delta)); diff --git a/onnxruntime/core/providers/cuda/gpu_data_transfer.h b/onnxruntime/core/providers/cuda/gpu_data_transfer.h index 0f3d4687eb..50ba91c441 100644 --- a/onnxruntime/core/providers/cuda/gpu_data_transfer.h +++ b/onnxruntime/core/providers/cuda/gpu_data_transfer.h @@ -22,6 +22,8 @@ class GPUDataTransfer : public IDataTransfer { bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override; + // Dampen MSVC warning about not fully 
overriding + using IDataTransfer::CopyTensor; common::Status CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const override; cudaStream_t GetStream(int queue_id) const { diff --git a/onnxruntime/core/providers/cuda/tensor/resize_impl.cu b/onnxruntime/core/providers/cuda/tensor/resize_impl.cu index 044d297234..d87c0a3c3e 100644 --- a/onnxruntime/core/providers/cuda/tensor/resize_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/resize_impl.cu @@ -461,9 +461,9 @@ void ResizeNearestImpl( float cubic_coeff_a, CudaFunctionOriginalCoordinate transform_coordinate, CudaFunctionNearestPixel calc_nearest_pixel, - int64_t* prefix_dim_sum, + int64_t* /* prefix_dim_sum */, NearestMappingInfo* dims_mapping) { - int blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); + int blocksPerGrid = static_cast(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); bool could2d = rank >= 2 && transform_coordinate != GetDeviceOriginalCoordinateFunc(ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE) && @@ -472,7 +472,7 @@ void ResizeNearestImpl( int64_t output_height = output_shape[rank - 2]; int64_t output_width = output_shape[rank - 1]; fast_divmod div_output_image = (rank > 2) ? output_div_pitches[rank - 3] : fast_divmod(output_height * output_width); - int blocksPerDimsMappingGrid = (int)(ceil((output_height + output_width) / 32.0)); + int blocksPerDimsMappingGrid = static_cast(ceil((output_height + output_width) / 32.0)); _ResizeNearestMappingKernel2D<<>>( input_shape[rank - 2], input_shape[rank - 1],