diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index c210073c95..c4cb3ae7db 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -610,6 +610,11 @@ else()
       target_compile_options(libprotobuf-lite PRIVATE "-Wno-enum-constexpr-conversion")
     endif()
   endif()
+
+  # Enable additional warnings that may not be on by default.
+  if (HAS_SHORTEN_64_TO_32)
+    list(APPEND ORT_WARNING_FLAGS -Wshorten-64-to-32)
+  endif()
 endif()
 
 #names in this var must match the directory names under onnxruntime/core/providers
diff --git a/cmake/onnxruntime_config.h.in b/cmake/onnxruntime_config.h.in
index 4f5125569c..2aef9dcf20 100644
--- a/cmake/onnxruntime_config.h.in
+++ b/cmake/onnxruntime_config.h.in
@@ -3,23 +3,24 @@
 
 #pragma once
 
+#cmakedefine HAS_BITWISE_INSTEAD_OF_LOGICAL
+#cmakedefine HAS_CAST_FUNCTION_TYPE
+#cmakedefine HAS_CATCH_VALUE
+#cmakedefine HAS_CLASS_MEMACCESS
+#cmakedefine HAS_DEPRECATED_COPY
+#cmakedefine HAS_DEPRECATED_DECLARATIONS
+#cmakedefine HAS_FORMAT_TRUNCATION
+#cmakedefine HAS_IGNORED_ATTRIBUTES
+#cmakedefine HAS_MAYBE_UNINITIALIZED
+#cmakedefine HAS_MISSING_BRACES
+#cmakedefine HAS_NONNULL_COMPARE
+#cmakedefine HAS_PARENTHESES
+#cmakedefine HAS_REALLOCARRAY
+#cmakedefine HAS_SHORTEN_64_TO_32
+#cmakedefine HAS_TAUTOLOGICAL_POINTER_COMPARE
 #cmakedefine HAS_UNUSED_BUT_SET_PARAMETER
 #cmakedefine HAS_UNUSED_BUT_SET_VARIABLE
 #cmakedefine HAS_UNUSED_VARIABLE
-#cmakedefine HAS_CAST_FUNCTION_TYPE
-#cmakedefine HAS_PARENTHESES
 #cmakedefine HAS_USELESS_CAST
-#cmakedefine HAS_NONNULL_COMPARE
-#cmakedefine HAS_TAUTOLOGICAL_POINTER_COMPARE
-#cmakedefine HAS_CATCH_VALUE
-#cmakedefine HAS_MISSING_BRACES
-#cmakedefine HAS_IGNORED_ATTRIBUTES
-#cmakedefine HAS_DEPRECATED_COPY
-#cmakedefine HAS_CLASS_MEMACCESS
-#cmakedefine HAS_MAYBE_UNINITIALIZED
-#cmakedefine HAS_DEPRECATED_DECLARATIONS
-#cmakedefine HAS_FORMAT_TRUNCATION
-#cmakedefine HAS_BITWISE_INSTEAD_OF_LOGICAL
-#cmakedefine HAS_REALLOCARRAY
-#cmakedefine ORT_VERSION u8"@ORT_VERSION@"
 #cmakedefine ORT_BUILD_INFO u8"@ORT_BUILD_INFO@"
+#cmakedefine ORT_VERSION u8"@ORT_VERSION@"
diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake
index a241db80dc..e02c8a71e0 100644
--- a/cmake/onnxruntime_providers.cmake
+++ b/cmake/onnxruntime_providers.cmake
@@ -1785,6 +1785,12 @@ if (onnxruntime_USE_XNNPACK)
             RUNTIME   DESTINATION ${CMAKE_INSTALL_BINDIR}
             FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
   endif()
+
+  # TODO: Fix the remaining shorten-64-to-32 warnings. Until then, they are not treated as errors in builds
+  # where sizeof(size_t) != sizeof(int64_t), e.g., the 'ONNX Runtime Web CI Pipeline', which still has some.
+  if (HAS_SHORTEN_64_TO_32 AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
+    target_compile_options(onnxruntime_providers_xnnpack PRIVATE -Wno-error=shorten-64-to-32)
+  endif()
 endif()
 
 if (onnxruntime_USE_CANN)
diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake
index a0fc2cbbcb..8eb346a117 100644
--- a/cmake/onnxruntime_python.cmake
+++ b/cmake/onnxruntime_python.cmake
@@ -112,7 +112,7 @@ if (onnxruntime_USE_NCCL)
 endif()
 
 if(APPLE)
-  set(ONNXRUNTIME_SO_LINK_FLAG "-Xlinker -exported_symbols_list ${ONNXRUNTIME_ROOT}/python/exported_symbols.lst")
+  set(ONNXRUNTIME_SO_LINK_FLAG "-Xlinker -exported_symbols_list -Xlinker ${ONNXRUNTIME_ROOT}/python/exported_symbols.lst")
 elseif(UNIX)
   if (onnxruntime_ENABLE_EXTERNAL_CUSTOM_OP_SCHEMAS)
     set(ONNXRUNTIME_SO_LINK_FLAG "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/python/version_script_expose_onnx_protobuf.lds -Xlinker --gc-sections")
@@ -223,7 +223,7 @@ if (MSVC)
   # Explicitly use the release version of the python library to make the project file consistent with this.
   target_link_libraries(onnxruntime_pybind11_state PRIVATE ${Python_LIBRARY_RELEASE})
 elseif (APPLE)
-  set_target_properties(onnxruntime_pybind11_state PROPERTIES LINK_FLAGS "${ONNXRUNTIME_SO_LINK_FLAG} -undefined dynamic_lookup")
+  set_target_properties(onnxruntime_pybind11_state PROPERTIES LINK_FLAGS "${ONNXRUNTIME_SO_LINK_FLAG} -Xlinker -undefined -Xlinker dynamic_lookup")
   set_target_properties(onnxruntime_pybind11_state PROPERTIES
     INSTALL_RPATH "@loader_path"
     BUILD_WITH_INSTALL_RPATH TRUE
diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
index 71b2123d68..822ef79704 100644
--- a/cmake/onnxruntime_unittests.cmake
+++ b/cmake/onnxruntime_unittests.cmake
@@ -839,6 +839,12 @@ else()
   target_compile_options(onnxruntime_test_all PRIVATE "-Wno-parentheses")
 endif()
 
+# TODO: Fix the remaining shorten-64-to-32 warnings. Until then, they are not treated as errors in builds
+# where sizeof(size_t) != sizeof(int64_t), e.g., the 'ONNX Runtime Web CI Pipeline', which still has some.
+if (HAS_SHORTEN_64_TO_32 AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
+  target_compile_options(onnxruntime_test_all PRIVATE -Wno-error=shorten-64-to-32)
+endif()
+
 if (UNIX AND onnxruntime_USE_TENSORRT)
     set_property(TARGET onnxruntime_test_all APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations")
 endif()
diff --git a/include/onnxruntime/core/common/eigen_common_wrapper.h b/include/onnxruntime/core/common/eigen_common_wrapper.h
index 4515f80ffd..57599e0403 100644
--- a/include/onnxruntime/core/common/eigen_common_wrapper.h
+++ b/include/onnxruntime/core/common/eigen_common_wrapper.h
@@ -41,6 +41,14 @@
 #pragma GCC diagnostic ignored "-Wunused-but-set-variable"
 #endif
 
+// eigen-src/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h:231:56: error: implicit conversion loses integer
+//   precision: 'uint64_t' (aka 'unsigned long long') to 'size_t' (aka 'unsigned long') [-Werror,-Wshorten-64-to-32]
+// next = wnext == kStackMask ? nullptr : &waiters_[wnext];
+//                                         ~~~~~~~~ ^~~~~
+#ifdef HAS_SHORTEN_64_TO_32
+#pragma GCC diagnostic ignored "-Wshorten-64-to-32"
+#endif
+
 #elif defined(_MSC_VER)
 // build\windows\debug\external\eigen3\unsupported\eigen\cxx11\src/Tensor/Tensor.h(76):
 // warning C4554: '&': check operator precedence for possible error; use parentheses to clarify precedence
diff --git a/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h b/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h
index 7f0046d137..542b9052d4 100644
--- a/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h
+++ b/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h
@@ -27,6 +27,13 @@
 #ifdef HAS_CLASS_MEMACCESS
 #pragma GCC diagnostic ignored "-Wclass-memaccess"
 #endif
+// eigen-src/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h:231:56: error: implicit conversion loses integer
+//   precision: 'uint64_t' (aka 'unsigned long long') to 'size_t' (aka 'unsigned long') [-Werror,-Wshorten-64-to-32]
+// next = wnext == kStackMask ? nullptr : &waiters_[wnext];
+//                                         ~~~~~~~~ ^~~~~
+#ifdef HAS_SHORTEN_64_TO_32
+#pragma GCC diagnostic ignored "-Wshorten-64-to-32"
+#endif
 #elif defined(_MSC_VER)
 #pragma warning(push)
 #pragma warning(disable : 4127)
diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h
index 97d62a81e7..205d94fae9 100644
--- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h
+++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h
@@ -224,7 +224,7 @@ Status BeamSearchGpt<T>::Execute(const FeedsFetchesManager* init_run_feeds_fetch
                                          gpt_subgraph_.has_decoder_masked_attention_));
 
   if (gpt_subgraph_.past_present_share_buffer_) {  // Reuse past and present
-    fetches.reserve(static_cast<int64_t>(gpt_subgraph_.GetFirstPresentOutputIndex()) + gpt_subgraph_.num_layers);
+    fetches.reserve(static_cast<size_t>(gpt_subgraph_.GetFirstPresentOutputIndex()) + gpt_subgraph_.num_layers);
     fetches.resize(gpt_subgraph_.GetFirstPresentOutputIndex(), OrtValue());
     for (int layer = 0; layer < gpt_subgraph_.num_layers; layer++) {
       int feed_idx = gpt_subgraph_.GetFirstPastInputIndex() + layer;
diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_t5.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_t5.h
index 72db3b1d0b..14a0db57c4 100644
--- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_t5.h
+++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_t5.h
@@ -259,7 +259,8 @@ Status BeamSearchT5<T>::Execute(const FeedsFetchesManager& encoder_feeds_fetches
                                                              decoder_subgraph_.has_decoder_masked_attention_));
 
     if (decoder_subgraph_.past_present_share_buffer_) {
-      decoder_fetches.reserve(static_cast<int64_t>(decoder_subgraph_.GetFirstPresentOutputIndex()) + 2 * static_cast<int64_t>(decoder_subgraph_.num_layers));
+      decoder_fetches.reserve(static_cast<size_t>(decoder_subgraph_.GetFirstPresentOutputIndex()) +
+                              2 * static_cast<size_t>(decoder_subgraph_.num_layers));
       decoder_fetches.resize(decoder_subgraph_.GetFirstPresentOutputIndex(), OrtValue());
       for (int layer = 0; layer < 2 * decoder_subgraph_.num_layers; layer++) {
         int feed_idx = decoder_subgraph_.GetFirstPastInputIndex() + layer;
diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h
index e5bc01ef1f..198dec011c 100644
--- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h
+++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h
@@ -251,7 +251,8 @@ Status BeamSearchWhisper<T>::Execute(const FeedsFetchesManager& encoder_feeds_fe
                                                              decoder_subgraph_.has_decoder_masked_attention_));
 
     if (decoder_subgraph_.past_present_share_buffer_) {
-      decoder_fetches.reserve(static_cast<int64_t>(decoder_subgraph_.GetFirstPresentOutputIndex()) + 2 * static_cast<int64_t>(decoder_subgraph_.num_layers));
+      decoder_fetches.reserve(static_cast<size_t>(decoder_subgraph_.GetFirstPresentOutputIndex()) +
+                              2 * static_cast<size_t>(decoder_subgraph_.num_layers));
       decoder_fetches.resize(decoder_subgraph_.GetFirstPresentOutputIndex(), OrtValue());
       for (int layer = 0; layer < 2 * decoder_subgraph_.num_layers; layer++) {
         int feed_idx = decoder_subgraph_.GetFirstPastInputIndex() + layer;
diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc b/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc
index 08a5a9fa1d..7e2e5b2129 100644
--- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc
+++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc
@@ -224,7 +224,8 @@ void BeamSearchScorer::Finalize(ISequences& sequences,
     if (!sequence_scores.empty())
       sequence_scores_buffer = sequence_scores.subspan(batch_index * num_return_sequences_, num_return_sequences_);
 
-    beam_hyp.Output(num_return_sequences_, max_length_, batch_output, sequence_scores_buffer);
+    beam_hyp.Output(narrow<int>(num_return_sequences_), narrow<int>(max_length_), batch_output,
+                    sequence_scores_buffer);
   }
 }
 
diff --git a/onnxruntime/contrib_ops/cpu/transformers/greedy_search_impl_gpt.h b/onnxruntime/contrib_ops/cpu/transformers/greedy_search_impl_gpt.h
index f426b88ec6..4504b099e3 100644
--- a/onnxruntime/contrib_ops/cpu/transformers/greedy_search_impl_gpt.h
+++ b/onnxruntime/contrib_ops/cpu/transformers/greedy_search_impl_gpt.h
@@ -229,7 +229,7 @@ Status GreedySearchGpt<T, ParametersT>::Execute(const FeedsFetchesManager* init_
   ORT_RETURN_IF_ERROR(CreateInitialFeeds(greedy_state.sequence_lengths, expanded_input_ids_in_cpu, feeds, buffer));
 
   if (gpt_subgraph_.past_present_share_buffer_) {  // Reuse past and present
-    fetches.reserve((int64_t)gpt_subgraph_.GetFirstPresentOutputIndex() + gpt_subgraph_.num_layers);
+    fetches.reserve(static_cast<size_t>(gpt_subgraph_.GetFirstPresentOutputIndex()) + gpt_subgraph_.num_layers);
     fetches.resize(gpt_subgraph_.GetFirstPresentOutputIndex(), OrtValue());
     for (int layer = 0; layer < gpt_subgraph_.num_layers; layer++) {
       int feed_idx = gpt_subgraph_.GetFirstPastInputIndex() + layer;
diff --git a/onnxruntime/core/framework/print_tensor_utils.h b/onnxruntime/core/framework/print_tensor_utils.h
index 9509ca2646..6bd4e2d3af 100644
--- a/onnxruntime/core/framework/print_tensor_utils.h
+++ b/onnxruntime/core/framework/print_tensor_utils.h
@@ -139,9 +139,9 @@ void PrintCpuTensor(const Tensor& tensor, int threshold = kDefaultSnippetThresho
   bool is_snippet = (threshold > 0 && static_cast<int64_t>(threshold) < num_items);
   size_t num_dims = shape.NumDimensions();
   if (num_dims >= 3) {
-    int dim0 = static_cast<int>(shape.SizeToDimension(num_dims - 2));
-    int dim1 = static_cast<int>(shape[num_dims - 2]);
-    int dim2 = static_cast<int>(shape[num_dims - 1]);
+    int64_t dim0 = shape.SizeToDimension(num_dims - 2);
+    int64_t dim1 = shape[num_dims - 2];
+    int64_t dim2 = shape[num_dims - 1];
     if (is_snippet) {
       PrintCpuTensorSnippet<T>(data, dim0, dim1, dim2, edge_items);
     } else {
@@ -150,11 +150,11 @@ void PrintCpuTensor(const Tensor& tensor, int threshold = kDefaultSnippetThresho
     return;
   }
 
-  size_t num_rows = 1;
+  int64_t num_rows = 1;
   if (num_dims > 1) {
-    num_rows = static_cast<size_t>(shape[0]);
+    num_rows = shape[0];
   }
-  size_t row_size = num_items / num_rows;
+  int64_t row_size = num_items / num_rows;
 
   if (is_snippet) {
     PrintCpuTensorSnippet<T>(data, num_rows, row_size, edge_items);
diff --git a/onnxruntime/core/framework/stream_execution_context.h b/onnxruntime/core/framework/stream_execution_context.h
index 1815e0d122..92a7b4fa5b 100644
--- a/onnxruntime/core/framework/stream_execution_context.h
+++ b/onnxruntime/core/framework/stream_execution_context.h
@@ -46,7 +46,9 @@ class StreamExecutionContext {
       return v_.fetch_sub(1, std::memory_order_relaxed) == 1;
     }
 
-    int32_t Get() { return v_.load(std::memory_order_relaxed); }
+    int32_t Get() {
+      return gsl::narrow_cast<int32_t>(v_.load(std::memory_order_relaxed));
+    }
 
     void Inc() {
       ++v_;
diff --git a/onnxruntime/core/platform/posix/env.cc b/onnxruntime/core/platform/posix/env.cc
index bcc21c5f03..8d970d2f03 100644
--- a/onnxruntime/core/platform/posix/env.cc
+++ b/onnxruntime/core/platform/posix/env.cc
@@ -355,7 +355,7 @@ class PosixEnv : public Env {
         micros -= static_cast<int64_t>(sleep_time.tv_sec) * OneMillion;
       }
       if (micros < OneMillion) {
-        sleep_time.tv_nsec = 1000 * micros;
+        sleep_time.tv_nsec = static_cast<decltype(timespec::tv_nsec)>(1000 * micros);
         micros = 0;
       }
       while (nanosleep(&sleep_time, &sleep_time) != 0 && errno == EINTR) {
@@ -457,9 +457,9 @@ class PosixEnv : public Env {
       return Status::OK();
     }
 
-    static const long page_size = sysconf(_SC_PAGESIZE);
+    static const size_t page_size = narrow<size_t>(sysconf(_SC_PAGESIZE));
     const FileOffsetType offset_to_page = offset % static_cast<FileOffsetType>(page_size);
-    const size_t mapped_length = length + offset_to_page;
+    const size_t mapped_length = length + static_cast<size_t>(offset_to_page);
     const FileOffsetType mapped_offset = offset - offset_to_page;
     void* const mapped_base =
         mmap(nullptr, mapped_length, PROT_READ | PROT_WRITE, MAP_PRIVATE, file_descriptor.Get(), mapped_offset);
diff --git a/onnxruntime/core/providers/cpu/ml/linearclassifier.cc b/onnxruntime/core/providers/cpu/ml/linearclassifier.cc
index c964a98cf2..943e911341 100644
--- a/onnxruntime/core/providers/cpu/ml/linearclassifier.cc
+++ b/onnxruntime/core/providers/cpu/ml/linearclassifier.cc
@@ -35,7 +35,7 @@ LinearClassifier::LinearClassifier(const OpKernelInfo& info)
     ORT_ENFORCE(!coefficients_.empty());
 
   using_strings_ = !classlabels_strings_.empty();
-  class_count_ = static_cast<int64_t>(intercepts_.size());
+  class_count_ = static_cast<ptrdiff_t>(intercepts_.size());
 }
 
 // Use GEMM for the calculations, with broadcasting of intercepts
diff --git a/onnxruntime/core/providers/cpu/nn/dropout_op.h b/onnxruntime/core/providers/cpu/nn/dropout_op.h
index 7add3e7fa2..7878f1b94c 100644
--- a/onnxruntime/core/providers/cpu/nn/dropout_op.h
+++ b/onnxruntime/core/providers/cpu/nn/dropout_op.h
@@ -3,6 +3,7 @@
 
 #pragma once
 
+#include "core/common/narrow.h"
 #include "core/framework/op_kernel.h"
 #include "core/framework/random_generator.h"
 #include <chrono>
@@ -56,10 +57,10 @@ Status Dropout<T1, T2>::Compute(OpKernelContext* context) const {
   auto Y_span = Y->MutableDataAsSpan<T1>();
   Tensor* mask = context->Output(1, X_shape);  // optional
   std::unique_ptr<bool[]> temp_mask_buffer{};  // temporary buffer to use if mask input is not provided
-  auto mask_span = [&X_shape, mask, &temp_mask_buffer]() {
+  auto mask_span = [X_size = narrow<size_t>(X_shape.Size()), mask, &temp_mask_buffer]() {
     if (mask) return mask->MutableDataAsSpan<bool>();
-    temp_mask_buffer = std::make_unique<bool[]>(X_shape.Size());
-    return gsl::make_span(temp_mask_buffer.get(), X_shape.Size());
+    temp_mask_buffer = std::make_unique<bool[]>(X_size);
+    return gsl::make_span(temp_mask_buffer.get(), X_size);
   }();
 
   ORT_ENFORCE(!mask || mask->Shape() == X_shape, "X and mask should have the same shape");
diff --git a/onnxruntime/core/providers/cpu/quantization/qlinearconv.cc b/onnxruntime/core/providers/cpu/quantization/qlinearconv.cc
index 2fce240ccc..e9fc8d857b 100644
--- a/onnxruntime/core/providers/cpu/quantization/qlinearconv.cc
+++ b/onnxruntime/core/providers/cpu/quantization/qlinearconv.cc
@@ -805,7 +805,7 @@ Status QLinearConv<ActType>::Compute(OpKernelContext* context) const {
             strides.data(),
             dilations.data(),
             pads.data(),
-            static_cast<int64_t>(kernel_rank),
+            static_cast<ptrdiff_t>(kernel_rank),
             static_cast<ActType*>(col_buffer.get()) + group_id * col_buffer_size,
             X_zero_point_value);
       }
diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc
index 69eac91501..da1266ec1d 100644
--- a/onnxruntime/python/onnxruntime_pybind_state.cc
+++ b/onnxruntime/python/onnxruntime_pybind_state.cc
@@ -12,6 +12,7 @@
 #include "core/common/inlined_containers.h"
 #include "core/common/logging/logging.h"
 #include "core/common/logging/severity.h"
+#include "core/common/narrow.h"
 #include "core/common/optional.h"
 #include "core/common/path_string.h"
 #include "core/framework/arena_extend_strategy.h"
@@ -95,7 +96,7 @@ void GetPyObjFromTensor(const Tensor& rtensor, py::object& obj,
   MLDataType dtype = rtensor.DataType();
   const int numpy_type = OnnxRuntimeTensorToNumpyType(dtype);
   obj = py::reinterpret_steal<py::object>(PyArray_SimpleNew(
-      shape.NumDimensions(), npy_dims.data(), numpy_type));
+      narrow<int>(shape.NumDimensions()), npy_dims.data(), numpy_type));
 
   void* out_ptr = static_cast<void*>(
       PyArray_DATA(reinterpret_cast<PyArrayObject*>(obj.ptr())));
@@ -1604,7 +1605,7 @@ including arg name, arg type (contains both type and shape).)pbdoc")
           if (is_arg_file_name) {
             OrtPybindThrowIfError(sess->GetSessionHandle()->Load(arg));
           } else {
-            OrtPybindThrowIfError(sess->GetSessionHandle()->Load(arg.data(), arg.size()));
+            OrtPybindThrowIfError(sess->GetSessionHandle()->Load(arg.data(), narrow<int>(arg.size())));
           }
         }
 
diff --git a/onnxruntime/test/onnx/tensorprotoutils.cc b/onnxruntime/test/onnx/tensorprotoutils.cc
index f7802ae577..91026bbe62 100644
--- a/onnxruntime/test/onnx/tensorprotoutils.cc
+++ b/onnxruntime/test/onnx/tensorprotoutils.cc
@@ -79,7 +79,7 @@ static void UnpackTensorWithRawData(const void* raw_data, size_t raw_data_length
 #define DEFINE_UNPACK_TENSOR(T, Type, field_name, field_size)                                             \
   template <>                                                                                             \
   void UnpackTensor(const onnx::TensorProto& tensor, const void* raw_data, size_t raw_data_len,           \
-                    /*out*/ T* p_data, int64_t expected_size) {                                           \
+                    /*out*/ T* p_data, size_t expected_size) {                                            \
     if (nullptr == p_data) {                                                                              \
       const size_t size = raw_data != nullptr ? raw_data_len : tensor.field_size();                       \
       if (size == 0) return;                                                                              \
@@ -92,7 +92,7 @@ static void UnpackTensorWithRawData(const void* raw_data, size_t raw_data_length
       UnpackTensorWithRawData(raw_data, raw_data_len, expected_size, p_data);                             \
       return;                                                                                             \
     }                                                                                                     \
-    if (tensor.field_size() != expected_size)                                                             \
+    if (static_cast<size_t>(tensor.field_size()) != expected_size)                                        \
       ORT_CXX_API_THROW(MakeString("corrupted protobuf data: tensor shape size(", expected_size,          \
                                    ") does not match the data size(", tensor.field_size(), ") in proto"), \
                         OrtErrorCode::ORT_FAIL);                                                          \
@@ -117,7 +117,7 @@ DEFINE_UNPACK_TENSOR(uint32_t, onnx::TensorProto_DataType_UINT32, uint64_data, u
 // doesn't support raw data
 template <>
 void UnpackTensor(const onnx::TensorProto& tensor, const void* /*raw_data*/, size_t /*raw_data_len*/,
-                  /*out*/ std::string* p_data, int64_t expected_size) {
+                  /*out*/ std::string* p_data, size_t expected_size) {
   if (nullptr == p_data) {
     if (tensor.string_data_size() == 0) return;
     ORT_CXX_API_THROW("", OrtErrorCode::ORT_INVALID_ARGUMENT);
@@ -126,7 +126,7 @@ void UnpackTensor(const onnx::TensorProto& tensor, const void* /*raw_data*/, siz
     ORT_CXX_API_THROW("", OrtErrorCode::ORT_INVALID_ARGUMENT);
   }
 
-  if (tensor.string_data_size() != expected_size)
+  if (static_cast<size_t>(tensor.string_data_size()) != expected_size)
     ORT_CXX_API_THROW(
         "UnpackTensor: the pre-allocate size does not match the size in proto", OrtErrorCode::ORT_FAIL);
 
@@ -139,7 +139,7 @@ void UnpackTensor(const onnx::TensorProto& tensor, const void* /*raw_data*/, siz
 }
 template <>
 void UnpackTensor(const onnx::TensorProto& tensor, const void* raw_data, size_t raw_data_len,
-                  /*out*/ bool* p_data, int64_t expected_size) {
+                  /*out*/ bool* p_data, size_t expected_size) {
   if (nullptr == p_data) {
     const size_t size = raw_data != nullptr ? raw_data_len : tensor.int32_data_size();
     if (size == 0) return;
@@ -153,7 +153,7 @@ void UnpackTensor(const onnx::TensorProto& tensor, const void* raw_data, size_t
     return UnpackTensorWithRawData(raw_data, raw_data_len, expected_size, p_data);
   }
 
-  if (tensor.int32_data_size() != expected_size)
+  if (static_cast<size_t>(tensor.int32_data_size()) != expected_size)
     ORT_CXX_API_THROW(
         "UnpackTensor: the pre-allocate size does not match the size in proto", OrtErrorCode::ORT_FAIL);
   for (int iter : tensor.int32_data()) {
@@ -164,7 +164,7 @@ void UnpackTensor(const onnx::TensorProto& tensor, const void* raw_data, size_t
 }
 template <>
 void UnpackTensor(const onnx::TensorProto& tensor, const void* raw_data, size_t raw_data_len,
-                  /*out*/ MLFloat16* p_data, int64_t expected_size) {
+                  /*out*/ MLFloat16* p_data, size_t expected_size) {
   if (nullptr == p_data) {
     const size_t size = raw_data != nullptr ? raw_data_len : tensor.int32_data_size();
     if (size == 0) return;
@@ -178,7 +178,7 @@ void UnpackTensor(const onnx::TensorProto& tensor, const void* raw_data, size_t
     return UnpackTensorWithRawData(raw_data, raw_data_len, expected_size, p_data);
   }
 
-  if (tensor.int32_data_size() != expected_size)
+  if (static_cast<size_t>(tensor.int32_data_size()) != expected_size)
     ORT_CXX_API_THROW(
         "UnpackTensor: the pre-allocate size does not match the size in proto", OrtErrorCode::ORT_FAIL);
 
@@ -197,7 +197,7 @@ void UnpackTensor(const onnx::TensorProto& tensor, const void* raw_data, size_t
 
 template <>
 void UnpackTensor(const onnx::TensorProto& tensor, const void* raw_data, size_t raw_data_len,
-                  /*out*/ BFloat16* p_data, int64_t expected_size) {
+                  /*out*/ BFloat16* p_data, size_t expected_size) {
   if (nullptr == p_data) {
     const size_t size = raw_data != nullptr ? raw_data_len : tensor.int32_data_size();
     if (size == 0)
@@ -213,7 +213,7 @@ void UnpackTensor(const onnx::TensorProto& tensor, const void* raw_data, size_t
     return UnpackTensorWithRawData(raw_data, raw_data_len, expected_size, p_data);
   }
 
-  if (tensor.int32_data_size() != expected_size)
+  if (static_cast<size_t>(tensor.int32_data_size()) != expected_size)
     ORT_CXX_API_THROW(
         "UnpackTensor: the pre-allocate size does not match the size in proto", OrtErrorCode::ORT_FAIL);
 
@@ -233,7 +233,7 @@ void UnpackTensor(const onnx::TensorProto& tensor, const void* raw_data, size_t
 #define DEFINE_UNPACK_TENSOR_FLOAT8(TYPE, ONNX_TYPE)                                                       \
   template <>                                                                                              \
   void UnpackTensor(const onnx::TensorProto& tensor, const void* raw_data, size_t raw_data_len,            \
-                    /*out*/ TYPE* p_data, int64_t expected_size) {                                         \
+                    /*out*/ TYPE* p_data, size_t expected_size) {                                          \
     if (nullptr == p_data) {                                                                               \
       const size_t size = raw_data != nullptr ? raw_data_len : tensor.int32_data_size();                   \
       if (size == 0)                                                                                       \
@@ -246,7 +246,7 @@ void UnpackTensor(const onnx::TensorProto& tensor, const void* raw_data, size_t
     if (raw_data != nullptr) {                                                                             \
       return UnpackTensorWithRawData(raw_data, raw_data_len, expected_size, p_data);                       \
     }                                                                                                      \
-    if (tensor.int32_data_size() != expected_size)                                                         \
+    if (static_cast<size_t>(tensor.int32_data_size()) != expected_size)                                    \
       ORT_CXX_API_THROW(                                                                                   \
           "UnpackTensor: the pre-allocate size does not match the size in proto", OrtErrorCode::ORT_FAIL); \
     constexpr int max_value = std::numeric_limits<uint8_t>::max();                                         \
@@ -360,9 +360,10 @@ ORT_API(void, OrtUninitializeBuffer, _In_opt_ void* input, size_t input_len, enu
   }
 }
 
-#define CASE_PROTO(X, Y)                                                                                       \
-  case onnx::TensorProto_DataType::TensorProto_DataType_##X:                                                   \
-    ::onnxruntime::test::UnpackTensor<Y>(tensor_proto, raw_data, raw_data_len, (Y*)preallocated, tensor_size); \
+#define CASE_PROTO(X, Y)                                                                      \
+  case onnx::TensorProto_DataType::TensorProto_DataType_##X:                                  \
+    ::onnxruntime::test::UnpackTensor<Y>(tensor_proto, raw_data, raw_data_len,                \
+                                         (Y*)preallocated, static_cast<size_t>(tensor_size)); \
     break;
 
 #define CASE_TYPE(X)                   \
@@ -466,7 +467,7 @@ Status TensorProtoToMLValue(const onnx::TensorProto& tensor_proto, const MemBuff
             deleter.param = new UnInitializeParam{preallocated, preallocated_size, ele_type};
           }
           ::onnxruntime::test::UnpackTensor<std::string>(tensor_proto, raw_data, raw_data_len,
-                                                         (std::string*)preallocated, tensor_size);
+                                                         (std::string*)preallocated, static_cast<size_t>(tensor_size));
           break;
         default: {
           std::ostringstream ostr;
diff --git a/onnxruntime/test/onnx/tensorprotoutils.h b/onnxruntime/test/onnx/tensorprotoutils.h
index 0ff6ef224d..cbfb1276ea 100644
--- a/onnxruntime/test/onnx/tensorprotoutils.h
+++ b/onnxruntime/test/onnx/tensorprotoutils.h
@@ -38,9 +38,9 @@ common::Status TensorProtoToMLValue(const onnx::TensorProto& input, const MemBuf
 
 template <typename T>
 void UnpackTensor(const onnx::TensorProto& tensor, const void* raw_data, size_t raw_data_len,
-                  /*out*/ T* p_data, int64_t expected_size);
+                  /*out*/ T* p_data, size_t expected_size);
 
 ONNXTensorElementDataType CApiElementTypeFromProtoType(int type);
 ONNXTensorElementDataType GetTensorElementType(const onnx::TensorProto& tensor_proto);
 }  // namespace test
-}  // namespace onnxruntime
\ No newline at end of file
+}  // namespace onnxruntime
diff --git a/onnxruntime/test/perftest/performance_runner.cc b/onnxruntime/test/perftest/performance_runner.cc
index 99abd3bbf1..9f2cbcf6a2 100644
--- a/onnxruntime/test/perftest/performance_runner.cc
+++ b/onnxruntime/test/perftest/performance_runner.cc
@@ -32,6 +32,13 @@ using onnxruntime::Status;
 #ifdef HAS_CLASS_MEMACCESS
 #pragma GCC diagnostic ignored "-Wclass-memaccess"
 #endif
+// eigen-src/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h:231:56: error: implicit conversion loses integer
+//   precision: 'uint64_t' (aka 'unsigned long long') to 'size_t' (aka 'unsigned long') [-Werror,-Wshorten-64-to-32]
+// next = wnext == kStackMask ? nullptr : &waiters_[wnext];
+//                                         ~~~~~~~~ ^~~~~
+#ifdef HAS_SHORTEN_64_TO_32
+#pragma GCC diagnostic ignored "-Wshorten-64-to-32"
+#endif
 #endif
 #include <unsupported/Eigen/CXX11/ThreadPool>
 #if defined(__GNUC__)
diff --git a/orttraining/orttraining/core/framework/tensorboard/event_writer.cc b/orttraining/orttraining/core/framework/tensorboard/event_writer.cc
index f1d57bb01e..9c0c351aee 100644
--- a/orttraining/orttraining/core/framework/tensorboard/event_writer.cc
+++ b/orttraining/orttraining/core/framework/tensorboard/event_writer.cc
@@ -2,6 +2,8 @@
 // Licensed under the MIT License.
 
 #include "orttraining/core/framework/tensorboard/event_writer.h"
+
+#include "onnxruntime_config.h"
 #include "orttraining/core/framework/tensorboard/crc32c.h"
 #include "core/platform/env.h"
 
@@ -13,6 +15,9 @@
 #if defined(__GNUC__)
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wunused-parameter"
+#if defined(HAS_SHORTEN_64_TO_32)
+#pragma GCC diagnostic ignored "-Wshorten-64-to-32"
+#endif
 #endif
 #include "tensorboard/compat/proto/event.pb.h"
 #if defined(__GNUC__)
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index 8a7c109f69..be8ba37264 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -13,13 +13,6 @@ import subprocess
 import sys
 from pathlib import Path
 
-try:
-    from packaging.version import Version as LooseVersion
-except ImportError:
-    # This is deprecated and will be removed in Python 3.12.
-    # See https://docs.python.org/3/library/distutils.html.
-    from distutils.version import LooseVersion  # pylint: disable=W4901
-
 SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
 REPO_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "..", ".."))
 
@@ -53,13 +46,10 @@ class UsageError(BaseError):
 
 
 def _check_python_version():
-    # According to the BUILD.md, python 3.5+ is required:
-    # Python 2 is definitely not supported and it should be safer to consider
-    # it won't run with python 4:
-    if sys.version_info[0] != 3:  # noqa: YTT201
-        raise BuildError(f"Bad python major version: expecting python 3, found version '{sys.version}'")
-    if sys.version_info[1] < 6:  # noqa: YTT203
-        raise BuildError(f"Bad python minor version: expecting python 3.6+, found version '{sys.version}'")
+    if sys.version_info < (3, 7):  # tuple comparison checks major and minor together
+        raise UsageError(
+            f"Invalid Python version. At least Python 3.7 is required. Actual Python version: {sys.version}"
+        )
 
 
 def _str_to_bool(s):
@@ -382,7 +372,11 @@ def parse_arguments():
         "--xcode_code_signing_identity", default="", help="The development identity used for code signing in Xcode"
     )
     parser.add_argument(
-        "--use_xcode", action="store_true", help="Use Xcode as cmake generator, this is only supported on MacOS."
+        "--use_xcode",
+        action="store_const",
+        const="Xcode",
+        dest="cmake_generator",
+        help="Use Xcode as cmake generator, this is only supported on MacOS. Equivalent to '--cmake_generator Xcode'.",
     )
     parser.add_argument(
         "--osx_arch",
@@ -551,7 +545,7 @@ def parse_arguments():
             "Xcode",
         ],
         default=None,
-        help="Specify the generator that CMake invokes. ",
+        help="Specify the generator that CMake invokes.",
     )
     parser.add_argument(
         "--enable_multi_device_test",
@@ -1183,19 +1177,6 @@ def generate_build_tree(
 
     if is_macOS() and not args.android:
         cmake_args += ["-DCMAKE_OSX_ARCHITECTURES=" + args.osx_arch]
-        if args.use_xcode:
-            cmake_ver = LooseVersion(subprocess.check_output(["cmake", "--version"]).decode("utf-8").split()[2])
-            xcode_ver = LooseVersion(
-                subprocess.check_output(["xcrun", "xcodebuild", "-version"]).decode("utf-8").split()[1]
-            )
-            # Requires Cmake 3.21.1+ for XCode 13+
-            # The legacy build system is not longer supported on XCode 13+
-            if xcode_ver >= LooseVersion("13") and cmake_ver < LooseVersion("3.21.1"):
-                raise BuildError("CMake 3.21.1+ required to use XCode 13+")
-            # Use legacy build system for old CMake [3.19, 3.21.1) which uses new build system by default
-            # CMake 3.18- use the legacy build system by default
-            if cmake_ver >= LooseVersion("3.19.0") and cmake_ver < LooseVersion("3.21.1"):
-                cmake_args += ["-T", "buildsystem=1"]
         if args.apple_deploy_target:
             cmake_args += ["-DCMAKE_OSX_DEPLOYMENT_TARGET=" + args.apple_deploy_target]
         # Code sign the binaries, if the code signing development identity and/or team id are provided
@@ -1225,13 +1206,14 @@ def generate_build_tree(
         cmake_args += ["-Donnxruntime_USE_SNPE=ON"]
 
     if args.ios:
+        if args.cmake_generator != "Xcode":  # --use_xcode also stores "Xcode" in cmake_generator
+            raise BuildError("iOS build requires use of the Xcode CMake generator ('--cmake_generator Xcode').")
+
         needed_args = [
-            args.use_xcode,
             args.ios_sysroot,
             args.apple_deploy_target,
         ]
         arg_names = [
-            "--use_xcode            " + "<need use xcode to cross build iOS on MacOS>",  # noqa: ISC003
             "--ios_sysroot          " + "<the location or name of the macOS platform SDK>",  # noqa: ISC003
             "--apple_deploy_target  " + "<the minimum version of the target platform>",  # noqa: ISC003
         ]
@@ -1437,7 +1419,7 @@ def build_targets(args, cmake_path, build_dir, configs, num_parallel_jobs, targe
                     "/nodeReuse:False",
                     f"/p:CL_MPCount={num_parallel_jobs}",
                 ]
-            elif is_macOS() and args.use_xcode:
+            elif args.cmake_generator == "Xcode":
                 # CMake will generate correct build tool args for Xcode
                 cmd_args += ["--parallel", str(num_parallel_jobs)]
             else:
@@ -2456,11 +2438,10 @@ def main():
                 cmake_extra_args = ["-A", target_arch, "-T", toolset, "-G", args.cmake_generator]
             if args.enable_wcos:
                 cmake_extra_defines.append("CMAKE_USER_MAKE_RULES_OVERRIDE=wcos_rules_override.cmake")
-        elif args.cmake_generator is not None and not (is_macOS() and args.use_xcode):
+        elif args.cmake_generator is not None:
             cmake_extra_args += ["-G", args.cmake_generator]
-        elif is_macOS():
-            if args.use_xcode:
-                cmake_extra_args += ["-G", "Xcode"]
+
+        if is_macOS():
             if not args.ios and not args.android and args.osx_arch == "arm64" and platform.machine() == "x86_64":
                 if args.test:
                     log.warning("Cannot test ARM64 build on X86_64. Will skip test running after build.")