Switch GSL to MS GSL 4.0.0 (#13416)

2026-06-21 02:18:09 +00:00 · 2022-10-29 04:15:20 -07:00 · 2022-10-29 04:15:20 -07:00 · 2ecd1d6622
commit 2ecd1d6622
parent 7fbfbf789f
292 changed files with 1128 additions and 4486 deletions
--- a/cgmanifests/cgmanifest.json
+++ b/cgmanifests/cgmanifest.json
@ -28,8 +28,8 @@
         "component": {
            "type": "git",
            "git": {
-               "commitHash": "58123b93bd7f12d17ac0c46379a0f2c0255d9213",
-               "repositoryUrl": "https://github.com/martinmoene/gsl-lite.git"
+               "commitHash": "a3534567187d2edc428efd3f13466ff75fe5805c",
+               "repositoryUrl": "https://github.com/microsoft/gsl.git"
            }
         }
      },
--- a/cgmanifests/generated/cgmanifest.json
+++ b/cgmanifests/generated/cgmanifest.json
@ -193,7 +193,7 @@
      "component": {
        "type": "git",
        "git": {
-          "commitHash": "53495a2a7d6ba7e0691a7f3602e9a5324bba6e45",
+          "commitHash": "58d77fa8070e8cec2dc1ed015d66b454c8d78850",
          "repositoryUrl": "https://github.com/google/googletest.git"
        },
        "comments": "git submodule at cmake/external/googletest"
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@ -1016,27 +1016,14 @@ if (CPUINFO_SUPPORTED)
  endif()
 endif()

-# bounds checking behavior.
-# throw instead of calling terminate if there's a bounds checking violation.
-# we make it through via a handler so CUDA does not complain
-# The following -DGSL macros are recognized by gsl-lite along with -Dgsl macros
-# no bounds checking in release build so no perf cost
-# if we enable onnxruntime_DISABLE_EXCEPTIONS, gsl will terminate
-if (onnxruntime_DISABLE_EXCEPTIONS)
-  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DGSL_TERMINATE_ON_CONTRACT_VIOLATION")
-else()
-  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DGSL_THROW_ON_CONTRACT_VIOLATION")
-endif()
-set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DGSL_UNENFORCED_ON_CONTRACT_VIOLATION")
-set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -DGSL_UNENFORCED_ON_CONTRACT_VIOLATION")
-set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} -DGSL_UNENFORCED_ON_CONTRACT_VIOLATION")
+include(gsl)

 include(eigen)

 #onnxruntime_EXTERNAL_LIBRARIES could contain onnx, onnx_proto,libprotobuf, cuda/cudnn,
 # dnnl/mklml, onnxruntime_codegen_tvm, tvm and pthread
 # pthread is always at the last
-set(onnxruntime_EXTERNAL_LIBRARIES onnx onnx_proto ${PROTOBUF_LIB} re2::re2)
+set(onnxruntime_EXTERNAL_LIBRARIES onnx onnx_proto ${PROTOBUF_LIB} re2::re2 ${GSL_TARGET})

 if(NOT onnxruntime_DISABLE_ABSEIL)
  set(ABSEIL_LIBS absl::inlined_vector absl::flat_hash_set
--- a/cmake/external/googletest
+++ b/cmake/external/googletest
@ -1 +1 @@
-Subproject commit 53495a2a7d6ba7e0691a7f3602e9a5324bba6e45
+Subproject commit 58d77fa8070e8cec2dc1ed015d66b454c8d78850
--- a/cmake/external/gsl.cmake
+++ b/cmake/external/gsl.cmake
@ -0,0 +1,16 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+include(FetchContent)  # provides FetchContent_Declare() and FetchContent_MakeAvailable()
+
+FetchContent_Declare(
+    GSL
+    GIT_REPOSITORY https://github.com/microsoft/gsl
+    GIT_TAG a3534567187d2edc428efd3f13466ff75fe5805c  # v4.0.0
+    # GIT_SHALLOW is deliberately not set: CMake only supports it when GIT_TAG is a branch or tag name, not a commit hash.
+    )
+
+FetchContent_MakeAvailable(GSL)  # populates the GSL sources and adds them to this build
+
+set(GSL_TARGET "Microsoft.GSL::GSL")  # target name that other cmake files link through ${GSL_TARGET}
+set(GSL_INCLUDE_DIR "$<TARGET_PROPERTY:${GSL_TARGET},INTERFACE_INCLUDE_DIRECTORIES>")  # generator expression giving that target's include dirs, for targets that add the path instead of linking
--- a/cmake/external/gsl.natvis
+++ b/cmake/external/gsl.natvis
@ -1,14 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">
-  <Type Name="gsl::span&lt;*&gt;">
-    <Intrinsic Name="_size" Expression="(last_ - first_)"/>
-    <DisplayString>{{ size={ _size() }}}</DisplayString>
-    <Expand>
-      <Item Name="[size]" ExcludeView="simple">_size()</Item>
-      <IndexListItems Condition="_size() &gt; 0">
-        <Size>_size()</Size>
-        <ValueNode>first_[$i]</ValueNode>
-      </IndexListItems>
-    </Expand>
-  </Type>
-</AutoVisualizer>
--- a/cmake/onnxruntime_common.cmake
+++ b/cmake/onnxruntime_common.cmake
@ -78,7 +78,7 @@ file(GLOB onnxruntime_common_src CONFIGURE_DEPENDS
 # Remove new/delete intercept. To deal with memory leaks
 # Use either non-mimalloc build OR use mimalloc built-in features.
 if(WIN32 AND onnxruntime_USE_MIMALLOC)
-    list(REMOVE_ITEM onnxruntime_common_src 
+    list(REMOVE_ITEM onnxruntime_common_src
    "${ONNXRUNTIME_ROOT}/core/platform/windows/debug_alloc.cc"
    "${ONNXRUNTIME_ROOT}/core/platform/windows/debug_alloc.h")
 endif()
@ -116,11 +116,6 @@ if(NOT onnxruntime_DISABLE_ABSEIL)
    target_sources(
        onnxruntime_common
        INTERFACE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/external/${ABSEIL_NATVIS_FILE}>)
-    set(GSL_NATVIS_FILE "gsl.natvis")
-    target_sources(
-        onnxruntime_common
-        INTERFACE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/external/${GSL_NATVIS_FILE}>
-    )
  endif()
 endif()

@ -131,7 +126,7 @@ target_include_directories(onnxruntime_common
    PUBLIC
        ${OPTIONAL_LITE_INCLUDE_DIR})

-target_link_libraries(onnxruntime_common safeint_interface Boost::mp11)
+target_link_libraries(onnxruntime_common safeint_interface Boost::mp11 ${GSL_TARGET})

 if(NOT WIN32)
  target_include_directories(onnxruntime_common PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/external/nsync/public")
--- a/cmake/onnxruntime_flatbuffers.cmake
+++ b/cmake/onnxruntime_flatbuffers.cmake
@ -9,7 +9,7 @@ file(GLOB onnxruntime_flatbuffers_srcs CONFIGURE_DEPENDS
 source_group(TREE ${REPO_ROOT} FILES ${onnxruntime_flatbuffers_srcs})

 onnxruntime_add_static_library(onnxruntime_flatbuffers ${onnxruntime_flatbuffers_srcs})
-onnxruntime_add_include_to_target(onnxruntime_flatbuffers onnx flatbuffers)
+onnxruntime_add_include_to_target(onnxruntime_flatbuffers onnx flatbuffers ${GSL_TARGET})
 if(onnxruntime_ENABLE_INSTRUMENT)
  target_compile_definitions(onnxruntime_flatbuffers PUBLIC ONNXRUNTIME_ENABLE_INSTRUMENT)
 endif()
@ -41,4 +41,4 @@ namespace std { using ::getenv; }
 ]])
  target_compile_options(flatbuffers PRIVATE /FI${CMAKE_BINARY_DIR}/gdk_cstdlib_wrapper.h)
  target_compile_options(flatc PRIVATE /FI${CMAKE_BINARY_DIR}/gdk_cstdlib_wrapper.h)
-endif()
+endif()
--- a/cmake/onnxruntime_mlas.cmake
+++ b/cmake/onnxruntime_mlas.cmake
@ -502,6 +502,7 @@ endif()

 foreach(mlas_target ${ONNXRUNTIME_MLAS_LIBS})
    target_include_directories(${mlas_target} PRIVATE ${ONNXRUNTIME_ROOT}/core/mlas/inc ${MLAS_SRC_DIR})
+    onnxruntime_add_include_to_target(${mlas_target} ${GSL_TARGET})
 endforeach()
 set_target_properties(onnxruntime_mlas PROPERTIES FOLDER "ONNXRuntime")
 if (WIN32)
--- a/cmake/onnxruntime_providers.cmake
+++ b/cmake/onnxruntime_providers.cmake
@ -550,7 +550,7 @@ if (onnxruntime_USE_DNNL)
  add_dependencies(onnxruntime_providers_dnnl onnxruntime_providers_shared project_dnnl ${onnxruntime_EXTERNAL_DEPENDENCIES})
  target_include_directories(onnxruntime_providers_dnnl PRIVATE ${ONNXRUNTIME_ROOT} ${eigen_INCLUDE_DIRS} ${DNNL_INCLUDE_DIR} ${DNNL_OCL_INCLUDE_DIR})
  # ${CMAKE_CURRENT_BINARY_DIR} is so that #include "onnxruntime_config.h" inside tensor_shape.h is found
-  target_link_libraries(onnxruntime_providers_dnnl PRIVATE dnnl ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 ${ABSEIL_LIBS})
+  target_link_libraries(onnxruntime_providers_dnnl PRIVATE dnnl ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 ${ABSEIL_LIBS} ${GSL_TARGET})
  install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/dnnl  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers)
  set_target_properties(onnxruntime_providers_dnnl PROPERTIES FOLDER "ONNXRuntime")
  set_target_properties(onnxruntime_providers_dnnl PROPERTIES LINKER_LANGUAGE CXX)
--- a/cmake/onnxruntime_unittests.cmake
+++ b/cmake/onnxruntime_unittests.cmake
@ -1097,6 +1097,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
  if (onnxruntime_BUILD_SHARED_LIB)
    onnxruntime_add_static_library(onnxruntime_mocked_allocator ${TEST_SRC_DIR}/util/test_allocator.cc)
    target_include_directories(onnxruntime_mocked_allocator PUBLIC ${TEST_SRC_DIR}/util/include)
+    target_link_libraries(onnxruntime_mocked_allocator PRIVATE ${GSL_TARGET})
    set_target_properties(onnxruntime_mocked_allocator PROPERTIES FOLDER "ONNXRuntimeTest")

    #################################################################
@ -1253,6 +1254,7 @@ else()
  onnxruntime_add_shared_library_module(custom_op_library ${TEST_SRC_DIR}/testdata/custom_op_library/custom_op_library.cc)
 endif()
 target_include_directories(custom_op_library PRIVATE ${REPO_ROOT}/include)
+target_link_libraries(custom_op_library PRIVATE ${GSL_TARGET})
 if(UNIX)
  if (APPLE)
    set(ONNXRUNTIME_CUSTOM_OP_LIB_LINK_FLAG "-Xlinker -dead_strip")
--- a/cmake/winml.cmake
+++ b/cmake/winml.cmake
@ -187,6 +187,7 @@ target_include_directories(winml_lib_telemetry PRIVATE ${winml_lib_telemetry_dir
 target_include_directories(winml_lib_telemetry PRIVATE ${winml_lib_common_dir}/inc)
 target_include_directories(winml_lib_telemetry PRIVATE ${ONNXRUNTIME_INCLUDE_DIR}/core/platform/windows)
 target_include_directories(winml_lib_telemetry PRIVATE ${REPO_ROOT}/winml)
+target_include_directories(winml_lib_telemetry PRIVATE ${GSL_INCLUDE_DIR})

 # Properties
 set_target_properties(winml_lib_telemetry
@ -264,6 +265,7 @@ target_include_directories(winml_lib_ort PRIVATE ${winml_lib_api_ort_dir})
 target_include_directories(winml_lib_ort PRIVATE ${winml_lib_common_dir}/inc)
 target_include_directories(winml_lib_ort PRIVATE ${ONNXRUNTIME_INCLUDE_DIR})
 target_include_directories(winml_lib_ort PRIVATE ${ONNXRUNTIME_ROOT})
+target_include_directories(winml_lib_ort PRIVATE ${GSL_INCLUDE_DIR})

 set_target_properties(winml_lib_ort
  PROPERTIES
@ -403,13 +405,13 @@ target_include_directories(winml_lib_image PRIVATE ${winml_lib_api_image_dir})
 target_include_directories(winml_lib_image PRIVATE ${winml_lib_common_dir}/inc)
 target_include_directories(winml_lib_image PRIVATE ${ONNXRUNTIME_ROOT})
 target_include_directories(winml_lib_image PRIVATE ${ONNXRUNTIME_INCLUDE_DIR})                                                        # for status.h
-target_include_directories(winml_lib_image PRIVATE ${REPO_ROOT}/cmake/external/gsl/include)
 target_include_directories(winml_lib_image PRIVATE ${REPO_ROOT}/cmake/external/onnx)
 target_include_directories(winml_lib_image PRIVATE ${REPO_ROOT}/cmake/external/protobuf/src)
 target_include_directories(winml_lib_image PRIVATE ${ONNXRUNTIME_INCLUDE_DIR}/core/platform/windows)
 target_include_directories(winml_lib_image PRIVATE ${REPO_ROOT}/cmake/external/flatbuffers/include)
 target_include_directories(winml_lib_image PRIVATE ${REPO_ROOT}/cmake/external/mp11/include)
 target_include_directories(winml_lib_image PRIVATE ${REPO_ROOT}/winml)
+target_include_directories(winml_lib_image PRIVATE ${GSL_INCLUDE_DIR})

 # Properties
 set_target_properties(winml_lib_image
@ -511,7 +513,6 @@ target_include_directories(winml_lib_api PRIVATE ${winml_lib_common_dir}/inc)

 target_include_directories(winml_lib_api PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
 target_include_directories(winml_lib_api PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/external/date/include)
-target_include_directories(winml_lib_api PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/external/gsl/include)
 target_include_directories(winml_lib_api PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/external/onnx)

 target_include_directories(winml_lib_api PRIVATE ${ONNXRUNTIME_INCLUDE_DIR})
@ -521,11 +522,11 @@ target_include_directories(winml_lib_api PRIVATE ${ONNXRUNTIME_ROOT}/core/graph)
 target_include_directories(winml_lib_api PRIVATE ${REPO_ROOT}/cmake/external/eigen)
 target_include_directories(winml_lib_api PRIVATE ${REPO_ROOT}/cmake/external/onnx)
 target_include_directories(winml_lib_api PRIVATE ${REPO_ROOT}/cmake/external/protobuf/src)
-target_include_directories(winml_lib_api PRIVATE ${REPO_ROOT}/cmake/external/gsl/include)
 target_include_directories(winml_lib_api PRIVATE ${REPO_ROOT}/cmake/external/SafeInt)
 target_include_directories(winml_lib_api PRIVATE ${REPO_ROOT}/cmake/external/flatbuffers/include)
 target_include_directories(winml_lib_api PRIVATE ${REPO_ROOT}/cmake/external/mp11/include)
 target_include_directories(winml_lib_api PRIVATE ${REPO_ROOT}/winml)
+target_include_directories(winml_lib_api PRIVATE ${GSL_INCLUDE_DIR})

 # Properties
 set_target_properties(winml_lib_api
@ -606,7 +607,6 @@ target_include_directories(winml_lib_api_experimental PRIVATE ${winml_lib_common

 target_include_directories(winml_lib_api_experimental PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
 target_include_directories(winml_lib_api_experimental PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/external/date/include)
-target_include_directories(winml_lib_api_experimental PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/external/gsl/include)
 target_include_directories(winml_lib_api_experimental PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/external/onnx)

 target_include_directories(winml_lib_api_experimental PRIVATE ${ONNXRUNTIME_INCLUDE_DIR})
@ -616,11 +616,11 @@ target_include_directories(winml_lib_api_experimental PRIVATE ${ONNXRUNTIME_ROOT
 target_include_directories(winml_lib_api_experimental PRIVATE ${REPO_ROOT}/cmake/external/eigen)
 target_include_directories(winml_lib_api_experimental PRIVATE ${REPO_ROOT}/cmake/external/onnx)
 target_include_directories(winml_lib_api_experimental PRIVATE ${REPO_ROOT}/cmake/external/protobuf/src)
-target_include_directories(winml_lib_api_experimental PRIVATE ${REPO_ROOT}/cmake/external/gsl/include)
 target_include_directories(winml_lib_api_experimental PRIVATE ${REPO_ROOT}/cmake/external/SafeInt)
 target_include_directories(winml_lib_api_experimental PRIVATE ${REPO_ROOT}/cmake/external/flatbuffers/include)
 target_include_directories(winml_lib_api_experimental PRIVATE ${REPO_ROOT}/cmake/external/mp11/include)
 target_include_directories(winml_lib_api_experimental PRIVATE ${REPO_ROOT}/winml)
+target_include_directories(winml_lib_api_experimental PRIVATE ${GSL_INCLUDE_DIR})

 # Properties
 set_target_properties(winml_lib_api_experimental
@ -692,8 +692,15 @@ target_include_directories(winml_lib_common PRIVATE ${winml_lib_api_dir})
 target_include_directories(winml_lib_common PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
 target_include_directories(winml_lib_common PRIVATE ${winml_lib_common_dir}/inc)
 target_include_directories(winml_lib_common PRIVATE ${REPO_ROOT}/winml)
+target_include_directories(winml_lib_common PRIVATE ${GSL_INCLUDE_DIR})
 target_precompiled_header(winml_lib_common lib/Common/inc/pch.h)

+# Properties
+set_target_properties(winml_lib_common
+  PROPERTIES
+  FOLDER
+  ${target_folder})
+
 if (onnxruntime_USE_DML)
  target_add_dml(winml_lib_common)
 endif()
@ -762,7 +769,6 @@ target_include_directories(winml_dll PRIVATE ${winml_lib_common_dir}/inc)

 target_include_directories(winml_dll PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
 target_include_directories(winml_dll PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/external/date/include)
-target_include_directories(winml_dll PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/external/gsl/include)
 target_include_directories(winml_dll PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/external/onnx)

 target_include_directories(winml_dll PRIVATE ${ONNXRUNTIME_INCLUDE_DIR})
@ -771,12 +777,12 @@ target_include_directories(winml_dll PRIVATE ${ONNXRUNTIME_ROOT})
 target_include_directories(winml_dll PRIVATE ${ONNXRUNTIME_ROOT}/core/graph)
 target_include_directories(winml_dll PRIVATE ${REPO_ROOT}/cmake/external/onnx)
 target_include_directories(winml_dll PRIVATE ${REPO_ROOT}/cmake/external/protobuf/src)
-target_include_directories(winml_dll PRIVATE ${REPO_ROOT}/cmake/external/gsl/include)
 target_include_directories(winml_dll PRIVATE ${REPO_ROOT}/cmake/external/eigen)
 target_include_directories(winml_dll PRIVATE ${REPO_ROOT}/cmake/external/SafeInt)
 target_include_directories(winml_dll PRIVATE ${REPO_ROOT}/cmake/external/flatbuffers/include)
 target_include_directories(winml_dll PRIVATE ${REPO_ROOT}/cmake/external/mp11/include)
 target_include_directories(winml_dll PRIVATE ${REPO_ROOT}/winml)
+target_include_directories(winml_dll PRIVATE ${GSL_INCLUDE_DIR})

 # Properties
 set_target_properties(winml_dll
--- a/cmake/winml_unittests.cmake
+++ b/cmake/winml_unittests.cmake
@ -183,7 +183,7 @@ add_dependencies(winml_test_common
  winml_api
  winml_dll
 )
-onnxruntime_add_include_to_target(winml_test_common onnx_proto)
+onnxruntime_add_include_to_target(winml_test_common onnx_proto ${GSL_TARGET})
 onnxruntime_add_static_library(winml_google_test_lib ${WINML_TEST_SRC_DIR}/common/googletest/main.cpp)
 set_winml_target_properties(winml_google_test_lib)

--- a/include/onnxruntime/core/common/common.h
+++ b/include/onnxruntime/core/common/common.h
@ -36,7 +36,6 @@
 #include "core/common/exceptions.h"
 #include "core/common/make_string.h"
 #include "core/common/status.h"
-#include "core/common/gsl_suppress.h"


 namespace onnxruntime {
--- a/include/onnxruntime/core/common/gsl.h
+++ b/include/onnxruntime/core/common/gsl.h
@ -0,0 +1,6 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "gsl/gsl"
--- a/include/onnxruntime/core/common/gsl_suppress.h
+++ b/include/onnxruntime/core/common/gsl_suppress.h
@ -1,15 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-#pragma once
-
-#ifndef GSL_SUPPRESS
-#if defined(__clang__) && !defined(__NVCC__)
-#define GSL_SUPPRESS(x) [[gsl::suppress("x")]]
-#else
-#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && !defined(__NVCC__)
-#define GSL_SUPPRESS(x) [[gsl::suppress(x)]]
-#else
-#define GSL_SUPPRESS(x)
-#endif  // _MSC_VER
-#endif  // __clang__
-#endif
--- a/include/onnxruntime/core/common/logging/capture.h
+++ b/include/onnxruntime/core/common/logging/capture.h
@ -4,7 +4,7 @@
 #pragma once

 #include <cstdarg>
-#include <gsl/gsl>
+#include "core/common/gsl.h"
 #include "core/common/common.h"
 #include "core/common/code_location.h"
 #include "core/common/logging/severity.h"
--- a/include/onnxruntime/core/common/narrow.h
+++ b/include/onnxruntime/core/common/narrow.h
@ -0,0 +1,77 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+// onnxruntime::narrow() is like gsl::narrow() but it is also available when exceptions are disabled.
+
+#if !defined(ORT_NO_EXCEPTIONS)
+
+#include "gsl/narrow"
+
+namespace onnxruntime {
+using gsl::narrow;
+}  // namespace onnxruntime
+
+#else  // ^^ !defined(ORT_NO_EXCEPTIONS) ^^ / vv defined(ORT_NO_EXCEPTIONS) vv
+
+#include <cstdio>     // std::fprintf
+#include <exception>  // std::terminate
+#include <type_traits>
+
+#include "gsl/util"  // gsl::narrow_cast
+
+namespace onnxruntime {
+
+namespace detail {
+[[noreturn]] inline void OnNarrowingError() noexcept {  // failure path used by both narrow() overloads; never returns
+  std::fprintf(stderr, "%s", "narrowing error\n");  // leave a diagnostic on stderr before the process ends
+  std::terminate();  // this branch is compiled only when ORT_NO_EXCEPTIONS is defined, so nothing is thrown
+}
+}  // namespace detail
+
+// This implementation of onnxruntime::narrow was copied and adapted from:
+// https://github.com/microsoft/GSL/blob/a3534567187d2edc428efd3f13466ff75fe5805c/include/gsl/narrow
+
+// narrow() : a checked version of narrow_cast() that terminates if the cast changed the value
+template <class T, class U, typename std::enable_if<std::is_arithmetic<T>::value>::type* = nullptr>  // overload chosen when the target type T is arithmetic
+// clang-format off
+GSL_SUPPRESS(type.1) // NO-FORMAT: attribute
+                      // clang-format on
+    constexpr T narrow(U u) noexcept {  // returns u converted to T; calls detail::OnNarrowingError() if the conversion changed the value
+  constexpr const bool is_different_signedness =
+      (std::is_signed<T>::value != std::is_signed<U>::value);  // true when exactly one of T and U is a signed type
+
+  // clang-format off
+GSL_SUPPRESS(es.103) // NO-FORMAT: attribute // don't overflow
+GSL_SUPPRESS(es.104) // NO-FORMAT: attribute // don't underflow
+GSL_SUPPRESS(p.2) // NO-FORMAT: attribute // don't rely on undefined behavior
+  // clang-format on
+  const T t = gsl::narrow_cast<T>(u);  // While this is technically undefined behavior in some cases (i.e., if the source value is of floating-point type
+                                       // and cannot fit into the destination integral type), the resultant behavior is benign on the platforms
+                                       // that we target (i.e., no hardware trap representations are hit).
+
+  if (static_cast<U>(t) != u || (is_different_signedness && ((t < T{}) != (u < U{})))) {  // round trip must reproduce u, and the sign must survive a signed/unsigned change
+    detail::OnNarrowingError();  // the sign test catches cases the round trip alone misses, e.g. -1 converted to an unsigned type and back
+  }
+
+  return t;  // reached only when the conversion preserved the value
+}
+
+template <class T, class U, typename std::enable_if<!std::is_arithmetic<T>::value>::type* = nullptr>  // overload chosen when the target type T is not arithmetic
+// clang-format off
+GSL_SUPPRESS(type.1) // NO-FORMAT: attribute
+                      // clang-format on
+    constexpr T narrow(U u) noexcept {  // returns u converted to T; calls detail::OnNarrowingError() if the conversion changed the value
+  const T t = gsl::narrow_cast<T>(u);  // unchecked conversion, validated by the round-trip test below
+
+  if (static_cast<U>(t) != u) {  // converting back to U must reproduce the input exactly; no signedness test in this overload
+    detail::OnNarrowingError();  // value was altered: report on stderr and terminate
+  }
+
+  return t;  // reached only when the conversion preserved the value
+}
+
+}  // namespace onnxruntime
+
+#endif  // defined(ORT_NO_EXCEPTIONS)
--- a/include/onnxruntime/core/common/span_utils.h
+++ b/include/onnxruntime/core/common/span_utils.h
@ -3,23 +3,26 @@

 #pragma once

-#include <gsl/gsl>
+#include <algorithm>
+
+#include "core/common/gsl.h"

 namespace onnxruntime {
-// Inspired by Fekir's Blog https://fekir.info/post/span-the-missing-constructor/
+
+// AsSpan inspired by Fekir's Blog https://fekir.info/post/span-the-missing-constructor/
 // Used under MIT license

 // Use AsSpan for less typing on any container including initializer list to create a span
 // (unnamed, untyped initializer list does not automatically convert to gsl::span).
-// {1, 2, 3} as such does not have a type 
+// {1, 2, 3} as such does not have a type
 // (see https://scottmeyers.blogspot.com/2014/03/if-braced-initializers-have-no-type-why.html)
-// 
+//
 //   Example: AsSpan({1, 2, 3}) results in gsl::span<const int>
-// 
+//
 // The above would deduce to std::initializer_list<int> and the result is gsl::span<const int>
 //
 // AsSpan<int64_t>({1, 2, 3}) produces gsl::span<const int64_t>
-// 
+//
 // We can also do std::array<int64_t, 3>{1, 2, 3} that can be automatically converted to span
 // without memory allocation.
 //
@ -38,7 +41,7 @@ template <class C>
 constexpr auto AsSpan(C& c) {
  return details::AsSpanImpl(c.data(), c.size());
 }
- 
+
 template <class C>
 constexpr auto AsSpan(const C& c) {
  return details::AsSpanImpl(c.data(), c.size());
@ -64,7 +67,22 @@ constexpr auto AsSpan(const T (&arr)[N]) {
  return details::AsSpanImpl(arr, N);
 }

-template<class T>
+template <class T>
 inline gsl::span<const T> EmptySpan() { return gsl::span<const T>(); }

-}
+template <class U, class T>  // U: element type of the returned span; T: element type of src
+[[nodiscard]] inline gsl::span<U> ReinterpretAsSpan(gsl::span<T> src) {  // views the memory of src as a span of U without copying
+  // adapted from gsl-lite span::as_span():
+  // https://github.com/gsl-lite/gsl-lite/blob/4720a2980a30da085b4ddb4a0ea2a71af7351a48/include/gsl/gsl-lite.hpp#L4102-L4108
+  Expects(src.size_bytes() % sizeof(U) == 0);  // contract check: the byte length of src must be a whole number of U elements
+  return gsl::span<U>(reinterpret_cast<U*>(src.data()), src.size_bytes() / sizeof(U));  // same start address, length recomputed in units of U. NOTE(review): alignment of src.data() for U is not checked here — confirm callers guarantee it
+}
+
+template <class T1, size_t Extent1, class T2, size_t Extent2>  // element types and extents of the two spans are deduced independently
+[[nodiscard]] inline bool SpanEq(gsl::span<T1, Extent1> a, gsl::span<T2, Extent2> b) {  // true when a and b hold equal elements in the same order
+  static_assert(std::is_same_v<std::remove_const_t<T1>, std::remove_const_t<T2>>,  // NOTE(review): <type_traits> is not included directly by this header — presumably it arrives via core/common/gsl.h; confirm
+                "T1 and T2 should be the same type except for const qualification");
+  return std::equal(a.begin(), a.end(), b.begin(), b.end());  // four-iterator std::equal: spans of different lengths compare unequal
+}
+
+}  // namespace onnxruntime
--- a/include/onnxruntime/core/common/status.h
+++ b/include/onnxruntime/core/common/status.h
@ -19,7 +19,7 @@ limitations under the License.
 #ifdef _WIN32
 #include <winerror.h>
 #endif
-#include "core/common/gsl_suppress.h"
+#include "core/common/gsl.h"
 namespace onnxruntime {
 namespace common {

--- a/include/onnxruntime/core/framework/data_types.h
+++ b/include/onnxruntime/core/framework/data_types.h
@ -9,7 +9,7 @@
 #include <type_traits>
 #include <map>
 #include <unordered_map>
-#include "core/common/gsl_suppress.h"
+#include "core/common/gsl.h"
 #include "core/common/common.h"
 #include "core/common/exceptions.h"
 #include "core/framework/endian.h"
--- a/include/onnxruntime/core/framework/float16.h
+++ b/include/onnxruntime/core/framework/float16.h
@ -8,7 +8,7 @@
 #endif

 #if !defined(__CUDACC__) && !defined(__HIPCC__)
-#include <gsl/gsl>
+#include "core/common/narrow.h"
 #endif

 #include "core/common/common.h"
@ -123,7 +123,7 @@ inline ORT_HOST_DEVICE bool operator<(const BFloat16& left, const BFloat16& righ
 // E.g 10_f16 or 10_b16
 #if !defined(__CUDACC__) && !defined(__HIPCC__)
 inline MLFloat16 operator"" _f16(unsigned long long int v) {
-  return MLFloat16(gsl::narrow<uint16_t>(v));
+  return MLFloat16(narrow<uint16_t>(v));
 }

 inline MLFloat16 operator"" _fp16(long double v) {
@ -131,7 +131,7 @@ inline MLFloat16 operator"" _fp16(long double v) {
 }

 inline BFloat16 operator"" _b16(unsigned long long int v) {
-  return BFloat16(gsl::narrow<uint16_t>(v), BFloat16::FromBits());
+  return BFloat16(narrow<uint16_t>(v), BFloat16::FromBits());
 }

 inline BFloat16 operator"" _bfp16(long double v) {
--- a/include/onnxruntime/core/framework/op_kernel.h
+++ b/include/onnxruntime/core/framework/op_kernel.h
@ -30,7 +30,7 @@
 #endif
 #include "onnx/onnx_pb.h"
 #include "onnx/onnx-operators_pb.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 namespace onnxruntime {
 class OpKernelContext;
 }
--- a/include/onnxruntime/core/framework/op_kernel_info.h
+++ b/include/onnxruntime/core/framework/op_kernel_info.h
@ -8,7 +8,7 @@
 #include "core/framework/ort_value.h"
 #include "core/framework/op_node_proto_helper.h"
 #include "core/graph/graph_viewer.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"

 namespace onnxruntime {

--- a/include/onnxruntime/core/framework/op_node_proto_helper.h
+++ b/include/onnxruntime/core/framework/op_node_proto_helper.h
@ -7,7 +7,7 @@
 #include "core/common/status.h"
 #include "core/framework/tensor_shape.h"
 #include "core/graph/graph_viewer.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #endif

 #ifdef __has_attribute
--- a/include/onnxruntime/core/framework/tensor.h
+++ b/include/onnxruntime/core/framework/tensor.h
@ -8,7 +8,7 @@
 #include <string>
 #include <vector>

-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "core/common/common.h"
 #include "core/framework/allocator.h"
 #include "core/framework/tensor_shape.h"
@ -189,7 +189,7 @@ class Tensor final {
    ORT_ENFORCE(utils::IsPrimitiveDataType<T>(dtype_), "Tensor type mismatch. ",
                "T ", "!=", dtype_);
    const T* data = reinterpret_cast<const T*>(static_cast<char*>(p_data_) + byte_offset_);
-    return gsl::make_span(data, static_cast<typename gsl::span<T>::index_type>(shape_.Size()));
+    return gsl::make_span(data, static_cast<typename gsl::span<T>::size_type>(shape_.Size()));
  }

  void* MutableDataRaw(MLDataType type) {
--- a/include/onnxruntime/core/framework/tensor_shape.h
+++ b/include/onnxruntime/core/framework/tensor_shape.h
@ -7,7 +7,7 @@
 #include <algorithm>
 #include <string>
 #include <cstring>
-#include <gsl/gsl>
+#include "core/common/gsl.h"
 #include "onnxruntime_config.h"

 #ifndef DISABLE_ABSEIL
@ -29,6 +29,8 @@
 #endif
 #endif  // DISABLE_ABSEIL

+#include "core/common/span_utils.h"
+
 namespace onnxruntime {
 #ifdef __GNUC__
 #pragma GCC diagnostic push
@ -96,7 +98,7 @@ class TensorShape {
  int64_t operator[](size_t idx) const { return values_[idx]; }
  int64_t& operator[](size_t idx) { return values_[idx]; }

-  bool operator==(const TensorShape& other) const noexcept { return GetDims() == other.GetDims(); }
+  bool operator==(const TensorShape& other) const noexcept { return SpanEq(GetDims(), other.GetDims()); }
  bool operator!=(const TensorShape& other) const noexcept { return !(*this == other); }

  size_t NumDimensions() const noexcept {
--- a/include/onnxruntime/core/graph/graph.h
+++ b/include/onnxruntime/core/graph/graph.h
@ -29,12 +29,13 @@
 #pragma warning(pop)
 #endif

-#include "gsl/gsl"
+#include "core/common/gsl.h"

 #include "core/common/common.h"
 #include "core/common/const_pointer_container.h"
 #include "core/common/inlined_containers_fwd.h"
 #include "core/common/path.h"
+#include "core/common/span_utils.h"
 #include "core/common/status.h"
 #include "core/common/logging/logging.h"
 #include "core/graph/basic_types.h"
@ -935,8 +936,8 @@ class Graph {
                const NodeAttributes* attributes = nullptr,
                const std::string& domain = kOnnxDomain) {
    return AddNode(name, op_type, description,
-                   gsl::make_span(input_args.begin(), input_args.end()),
-                   gsl::make_span(output_args.begin(), output_args.end()),
+                   AsSpan(input_args),
+                   AsSpan(output_args),
                   attributes, domain);
  }

@ -949,7 +950,7 @@ class Graph {
                const std::string& domain = kOnnxDomain) {
    return AddNode(name, op_type, description,
                   input_args,
-                   gsl::make_span(output_args.begin(), output_args.end()),
+                   AsSpan(output_args),
                   attributes, domain);
  }

@ -961,7 +962,7 @@ class Graph {
                const NodeAttributes* attributes = nullptr,
                const std::string& domain = kOnnxDomain) {
    return AddNode(name, op_type, description,
-                   gsl::make_span(input_args.begin(), input_args.end()),
+                   AsSpan(input_args),
                   output_args,
                   attributes, domain);
  }
@ -1153,7 +1154,7 @@ class Graph {
  void SetInputs(gsl::span<const NodeArg* const> inputs);

  void SetInputs(std::initializer_list<const NodeArg*> inputs) {
-    SetInputs(gsl::make_span(inputs));
+    SetInputs(AsSpan(inputs));
  }

  const Model& GetModel() const {
@ -1171,7 +1172,7 @@ class Graph {
  void SetOutputs(gsl::span<const NodeArg* const> outputs);

  void SetOutputs(std::initializer_list<const NodeArg*> outputs) {
-    SetOutputs(gsl::make_span(outputs.begin(), outputs.end()));
+    SetOutputs(AsSpan(outputs));
  }

 #endif  // !defined(ORT_MINIMAL_BUILD)
@ -1232,7 +1233,7 @@ class Graph {
  }

  void UpdateConsumerNodes(const std::string& node_arg_name, std::initializer_list<Node*> nodes) {
-    UpdateConsumerNodes(node_arg_name, gsl::make_span(nodes));
+    UpdateConsumerNodes(node_arg_name, AsSpan(nodes));
  }

  /** During constant folding it may become possible to infer the shape for a node.
--- a/include/onnxruntime/core/session/onnxruntime_c_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_c_api.h
@ -129,7 +129,6 @@ extern "C" {

 // Used in *.cc files. Almost as same as ORT_API_STATUS, except without ORT_MUST_USE_RESULT and ORT_EXPORT
 #define ORT_API_STATUS_IMPL(NAME, ...) \
-  GSL_SUPPRESS(r .11)                  \
  _Success_(return == 0) _Check_return_ _Ret_maybenull_ OrtStatusPtr ORT_API_CALL NAME(__VA_ARGS__) NO_EXCEPTION

 #define ORT_CLASS_RELEASE(X) void(ORT_API_CALL * Release##X)(_Frees_ptr_opt_ Ort##X * input)
--- a/onnxruntime/contrib_ops/cpu/attnlstm/attention_mechanism.h
+++ b/onnxruntime/contrib_ops/cpu/attnlstm/attention_mechanism.h
@ -3,7 +3,7 @@

 #pragma once

-#include <gsl/gsl>
+#include "core/common/gsl.h"

 namespace onnxruntime {
 namespace contrib {
--- a/onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.cc
+++ b/onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.cc
@ -51,7 +51,7 @@ void AttentionWrapper<T>::ProcessOutput(const gsl::span<const T>& rnn_cell_outpu
  // Get the context which is calculated within attention mechanism.
  attention_mechanism_.Compute(rnn_cell_output, prev_alignments_, attn_context_, alignments_);
  if (attention_mechanism_.NeedPrevAlignment()) {
-    std::copy(alignments_.cbegin(), alignments_.cend(), prev_alignments_.begin());
+    std::copy(alignments_.begin(), alignments_.end(), prev_alignments_.begin());
  }

  if (has_attn_layer_) {
--- a/onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.cc
+++ b/onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.cc
@ -63,11 +63,11 @@ template <typename T>
 void BahdanauAttention<T>::PrepareMemory(
    const gsl::span<const T>& memory,
    const gsl::span<const int>& memory_sequence_lengths) {
-  std::copy(memory.cbegin(), memory.cend(), values_.begin());
+  std::copy(memory.begin(), memory.end(), values_.begin());
  if (memory_sequence_lengths.empty()) {
    std::fill(mem_seq_lengths_.begin(), mem_seq_lengths_.end(), max_memory_steps_);
  } else {
-    std::copy(memory_sequence_lengths.cbegin(), memory_sequence_lengths.cend(), mem_seq_lengths_.begin());
+    std::copy(memory_sequence_lengths.begin(), memory_sequence_lengths.end(), mem_seq_lengths_.begin());
  }

  for (int b = 0; b < batch_size_; b++) {
@ -145,7 +145,7 @@ void BahdanauAttention<T>::Compute(
      }
    }

-    SoftmaxInplace(gsl::span<T>{alignments, gsl::narrow_cast<gsl::index>(mem_steps)});
+    SoftmaxInplace(gsl::span<T>{alignments, gsl::narrow_cast<size_t>(mem_steps)});

    // Calculate the context
    auto outspan = output.subspan(b * memory_depth_);
--- a/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.cc
+++ b/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.cc
@ -8,6 +8,7 @@

 #include "core/common/common.h"
 #include "core/common/logging/logging.h"
+#include "core/common/narrow.h"
 #include "core/platform/threadpool.h"
 #include "core/framework/allocator.h"
 //TODO: fix the warnings
@ -95,9 +96,9 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {

  auto& X_shape = X.Shape();

-  int seq_length = gsl::narrow<int>(X_shape[0]);
-  int batch_size = gsl::narrow<int>(X_shape[1]);
-  int input_size = gsl::narrow<int>(X_shape[2]);
+  int seq_length = narrow<int>(X_shape[0]);
+  int batch_size = narrow<int>(X_shape[1]);
+  int input_size = narrow<int>(X_shape[2]);

  // Processing attention wrapper
  constexpr int first_attn_input = 8;
@ -113,12 +114,12 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
      am_query_layer_weights, am_memory_layer_weights, am_v_weights, attn_memory, attn_memory_seq_lens, attn_layer_weights);
  ORT_RETURN_IF_ERROR(status);

-  const int max_memory_step = gsl::narrow<int>(attn_memory.Shape()[1]);
-  const int memory_depth = gsl::narrow<int>(am_memory_layer_weights.Shape()[1]);
-  const int am_attn_size = gsl::narrow<int>(am_memory_layer_weights.Shape()[2]);
-  const int query_depth = gsl::narrow<int>(am_query_layer_weights.Shape()[1]);  // it is equal to hidden_size
+  const int max_memory_step = narrow<int>(attn_memory.Shape()[1]);
+  const int memory_depth = narrow<int>(am_memory_layer_weights.Shape()[1]);
+  const int am_attn_size = narrow<int>(am_memory_layer_weights.Shape()[2]);
+  const int query_depth = narrow<int>(am_query_layer_weights.Shape()[1]);  // it is equal to hidden_size
  const bool has_attention_layer = attn_layer_weights != nullptr;
-  const int attn_layer_depth = has_attention_layer ? gsl::narrow<int>(attn_layer_weights->Shape()[2]) : 0;
+  const int attn_layer_depth = has_attention_layer ? narrow<int>(attn_layer_weights->Shape()[2]) : 0;
  const int attention_size = has_attention_layer ? attn_layer_depth : memory_depth;

  const gsl::span<const T> attn_layer_weights_span = (has_attention_layer) ? attn_layer_weights->DataAsSpan<T>() : gsl::span<const T>();
@ -202,7 +203,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {

  if (!output.empty() && !sequence_lens_span.empty()) {
    // clear tailing outputs
-    int32_t max_seq_this_batch = *std::max_element(sequence_lens_span.cbegin(), sequence_lens_span.cend());
+    int32_t max_seq_this_batch = *std::max_element(sequence_lens_span.begin(), sequence_lens_span.end());
    if (max_seq_this_batch >= 0 && max_seq_this_batch < seq_length) {
      auto start = max_seq_this_batch * hidden_output_size_per_direction * num_directions_;
      std::fill(output.begin() + start, output.end(), T{});
@ -424,8 +425,8 @@ static Status ValidateRnnInputsWithExtraInputFromState(
    }

    auto sequence_len_entries = sequence_lens->DataAsSpan<int>();
-    if (std::any_of(sequence_len_entries.cbegin(),
-                    sequence_len_entries.cend(),
+    if (std::any_of(sequence_len_entries.begin(),
+                    sequence_len_entries.end(),
                    [seq_length](int len) { return len <= 0 || len > seq_length; })) {
      return ORT_MAKE_STATUS(
          ONNXRUNTIME, INVALID_ARGUMENT,
@ -461,8 +462,8 @@ Status DeepCpuAttnLstmOp::ValidateInputs(
                           "Attention mechanism memory shape error! Expected: {", batch_size,
                           "}, actural: ", memory_shape);
  }
-  const int max_memory_step = gsl::narrow<int>(memory_shape[1]);
-  const int memory_depth = gsl::narrow<int>(memory_shape[2]);
+  const int max_memory_step = narrow<int>(memory_shape[1]);
+  const int memory_depth = narrow<int>(memory_shape[2]);
  if (attn_memory_seq_lens != nullptr) {
    auto memory_seq_lens_shape = attn_memory_seq_lens->Shape();
    if (memory_seq_lens_shape.NumDimensions() != 1 || memory_seq_lens_shape[0] != batch_size) {
@ -472,9 +473,9 @@ Status DeepCpuAttnLstmOp::ValidateInputs(
    }
    const gsl::span<const int> mem_seq_lens_span = attn_memory_seq_lens->DataAsSpan<int>();
    auto item_not_in_range = std::find_if(
-        mem_seq_lens_span.cbegin(), mem_seq_lens_span.cend(),
+        mem_seq_lens_span.begin(), mem_seq_lens_span.end(),
        [max_memory_step](int len) { return len <= 0 || len > max_memory_step; });
-    if (item_not_in_range != mem_seq_lens_span.cend()) {
+    if (item_not_in_range != mem_seq_lens_span.end()) {
      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
                             "Attention mechanism memory sequence lengths value must in (0, ",
                             max_memory_step, "], while ", *item_not_in_range, " found!");
@ -490,7 +491,7 @@ Status DeepCpuAttnLstmOp::ValidateInputs(
                           "Attention memory layer weight shape error! Expected:{",
                           num_directions_, ",", memory_depth, ", am_attn_size}, Got:", memory_layer_shape);
  }
-  const int am_attn_size = gsl::narrow<int>(memory_layer_shape[2]);
+  const int am_attn_size = narrow<int>(memory_layer_shape[2]);

  // check query layer weights of [num_directions, query_depth(hidden_size of lstm), am_attn_size]
  auto query_layer_shape = am_query_layer_weights.Shape();
@ -525,7 +526,7 @@ Status DeepCpuAttnLstmOp::ValidateInputs(
                             "Attention layer weight shape error! Expected: {", num_directions_, ", ",
                             memory_depth + hidden_size_, ", aw_attn_size}. Got:", attn_layer_shape);
    }
-    aw_attn_size = gsl::narrow<int>(attn_layer_shape[2]);
+    aw_attn_size = narrow<int>(attn_layer_shape[2]);
  }

  auto status = ValidateRnnInputsWithExtraInputFromState(
--- a/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.h
+++ b/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.h
@ -7,6 +7,7 @@

 #include "attention_wrapper.h"

+#include "core/common/narrow.h"
 #include "core/framework/op_kernel.h"
 #include "core/providers/cpu/rnn/rnn_helpers.h"

@ -30,7 +31,7 @@ class DeepCpuAttnLstmOp final : public OpKernel {

    int64_t int64_value;
    ORT_ENFORCE(info.GetAttr("hidden_size", &int64_value).IsOK() && int64_value > 0);
-    hidden_size_ = gsl::narrow<int>(int64_value);
+    hidden_size_ = narrow<int>(int64_value);

    // optional attributes
    std::vector<std::string> activation_func_names = info.GetAttrsOrDefault<std::string>("activations");
--- a/onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.cc
+++ b/onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.cc
@ -162,7 +162,7 @@ void UniDirectionalAttnLstm<T>::LoadPeepholeWeights(const gsl::span<const T>& pe
  DumpMatrix("P[f]", peephole_weights.data() + (i++ * hidden_size_), 1, hidden_size_);

  auto copy_weight = [this, &peephole_weights](int offset, gsl::span<T>& out) {
-    typename gsl::span<const T>::const_iterator in_iter = peephole_weights.cbegin() + offset;
+    typename gsl::span<const T>::iterator in_iter = peephole_weights.begin() + offset;
    std::copy(in_iter, in_iter + hidden_size_, out.begin());
  };

@ -245,9 +245,9 @@ void UniDirectionalAttnLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
  }

  // Calculate the max and min length
-  int32_t max_sequence_length = *std::max_element(sequence_lengths.cbegin(), sequence_lengths.cend());
-  int32_t min_sequence_length = std::min(seq_length_, *std::min_element(sequence_lengths.cbegin(),
-                                                                        sequence_lengths.cend()));
+  int32_t max_sequence_length = *std::max_element(sequence_lengths.begin(), sequence_lengths.end());
+  int32_t min_sequence_length = std::min(seq_length_, *std::min_element(sequence_lengths.begin(),
+                                                                        sequence_lengths.end()));

  ///**************************LSTM Calculations****************************/
  const int hidden_size_x4 = 4 * hidden_size_;
@ -255,9 +255,9 @@ void UniDirectionalAttnLstm<T>::Compute(const gsl::span<const T>& inputs_arg,

  // apply the weights to all the inputs and save to output_IOFC
  ComputeGemm(total_rows, hidden_size_x4, input_size_, T{1.0},
-              inputs.cbegin(), inputs.cend(),
+              inputs.begin(), inputs.end(),
              input_size_,
-              input_weights.cbegin(), input_weights.cend(),  // W[iofc]^T
+              input_weights.begin(), input_weights.end(),  // W[iofc]^T
              input_size_ + attention_size_, T{0.0},
              output_iofc_.begin(), output_iofc_.end(),
              hidden_size_x4, ttp_);
@ -278,7 +278,7 @@ void UniDirectionalAttnLstm<T>::Compute(const gsl::span<const T>& inputs_arg,

    // hidden state can be provided as input for first step, so need to special case that.
    // after the first step this will switch to the output from the previous step
-    span_T_const_iter previous_state = batched_hidden_state_one_step.cbegin();
+    span_T_const_iter previous_state = batched_hidden_state_one_step.begin();

    //run through steps sequentially
    for (int step = 0; step < max_sequence_length; step++) {
@ -293,9 +293,9 @@ void UniDirectionalAttnLstm<T>::Compute(const gsl::span<const T>& inputs_arg,

      // Xt*(W[iofc]^T) = INPUTt * W[iofc]^T + At-1 * WA[iofc]
      ComputeGemm(batch_size_, hidden_size_x4, attention_size_, T{1.0},
-                  attention.cbegin(), attention.cend(),  // At-1
+                  attention.begin(), attention.end(),  // At-1
                  attention_size_,
-                  input_weights.cbegin() + input_size_, input_weights.cend(),  // WA[iofc]
+                  input_weights.begin() + input_size_, input_weights.end(),  // WA[iofc]
                  input_size_ + attention_size_, T{1.0},
                  step_out_IOFC, output_iofc_.end(),  // input contains Xt*(W[iofc]^T)
                  hidden_size_x4, ttp_);
@ -304,7 +304,7 @@ void UniDirectionalAttnLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
      ComputeGemm(batch_size_, hidden_size_x4, hidden_size_, T{1.0},
                  previous_state, previous_state_end,  // Ht-1
                  hidden_size_,
-                  recurrent_weights.cbegin(), recurrent_weights.cend(),  // R[iofc]
+                  recurrent_weights.begin(), recurrent_weights.end(),  // R[iofc]
                  hidden_size_, T{1.0},
                  step_out_IOFC, output_iofc_.end(),  // input contains Xt*(W[iofc]^T)
                  hidden_size_x4, ttp_);
--- a/onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.h
+++ b/onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.h
@ -11,7 +11,7 @@
 #include "core/common/logging/logging.h"
 #include "core/framework/allocator.h"

-#include <gsl/gsl>
+#include "core/common/gsl.h"

 namespace onnxruntime {
 namespace contrib {
@ -69,7 +69,7 @@ class UniDirectionalAttnLstm {
  }

 private:
-  using span_T_const_iter = typename gsl::span<T>::const_iterator;
+  using span_T_const_iter = typename gsl::span<const T>::iterator;
  using span_T_iter = typename gsl::span<T>::iterator;

  void SetNumThreads();
--- a/onnxruntime/contrib_ops/cpu/bert/attention_helper.h
+++ b/onnxruntime/contrib_ops/cpu/bert/attention_helper.h
@ -101,7 +101,7 @@ void PrepareMask(const int32_t* mask_index,
  bool is_raw_attention_mask = (nullptr != mask_index && mask_index_dims.size() == 2);
  bool has_mask_start_position = (nullptr != mask_index &&
                                  mask_index_dims.size() == 1 &&
-                                  static_cast<int>(mask_index_dims.at(0)) == 2 * batch_size);
+                                  static_cast<int>(mask_index_dims[0]) == 2 * batch_size);

  for (int b_i = 0; b_i < batch_size; b_i++) {
    // TODO: mask_index can be used in softmax to save some calculation.
--- a/onnxruntime/contrib_ops/cpu/bert/bifurcation_detector.h
+++ b/onnxruntime/contrib_ops/cpu/bert/bifurcation_detector.h
@ -27,8 +27,8 @@ class BifurcationDetector : public OpKernel {
    const Tensor* pred_tokens = context->Input<Tensor>(3);
    const auto* src_tokens_data = static_cast<const int64_t*>(src_tokens->DataRaw());
    const auto* cur_tokens_data = static_cast<const int64_t*>(cur_tokens->DataRaw());
-    int64_t src_tokens_len = src_tokens->Shape().GetDims().at(0);
-    int64_t cur_tokens_len = cur_tokens->Shape().GetDims().at(0);
+    int64_t src_tokens_len = src_tokens->Shape().GetDims()[0];
+    int64_t cur_tokens_len = cur_tokens->Shape().GetDims()[0];

    Tensor* out_tokens = nullptr;

@ -45,7 +45,7 @@ class BifurcationDetector : public OpKernel {
    } else {
      const auto* pred_tokens_data = static_cast<const int64_t*>(pred_tokens->DataRaw());
      const int64_t prev_suffix_match_idx_data = static_cast<const int64_t*>(prev_suffix_match_idx->DataRaw())[0];
-      int64_t pred_tokens_len = pred_tokens->Shape().GetDims().at(0);
+      int64_t pred_tokens_len = pred_tokens->Shape().GetDims()[0];
      // Find bifurcation index between prediction tokens, and source tokens
      // starting from previous suffix match index.
      ORT_ENFORCE(src_tokens_len >= prev_suffix_match_idx_data);
@ -70,7 +70,7 @@ class BifurcationDetector : public OpKernel {
    // Return the index of the start of the n-gram in source tokens.
    // No matching if found if src tokens contain multiple or zero matching n-grams.
    // Return -1.
-    int64_t tokens_len = out_tokens->Shape().GetDims().at(0);
+    int64_t tokens_len = out_tokens->Shape().GetDims()[0];
    int64_t min_gram = min_ngram_size_;
    int64_t max_gram = max_ngram_size_;
    int64_t suffix_idx = -1;
--- a/onnxruntime/contrib_ops/cpu/bert/ngram_repeat_block.h
+++ b/onnxruntime/contrib_ops/cpu/bert/ngram_repeat_block.h
@ -3,8 +3,9 @@

 #pragma once

-#include <core/common/safeint.h>
 #include "core/common/common.h"
+#include "core/common/narrow.h"
+#include "core/common/safeint.h"
 #include "core/framework/op_kernel.h"
 #include "core/platform/threadpool.h"

@ -36,7 +37,7 @@ class NGramRepeatBlock : public OpKernel {
    int64_t cur_len = input_ids_dims[1];
    ORT_ENFORCE(scores_dims[0] == batch_size);
    int64_t vocab_size = scores_dims[1];
-    
+
    if (cur_len + 1 < ngram_size_) {
      return Status::OK();
    }
@ -69,7 +70,7 @@ class NGramRepeatBlock : public OpKernel {

    concurrency::ThreadPool* tp = context->GetOperatorThreadPool();
    concurrency::ThreadPool::TryParallelFor(
-        tp, gsl::narrow<std::ptrdiff_t>(batch_size) , static_cast<double>(cur_len * ngram_size_),
+        tp, narrow<std::ptrdiff_t>(batch_size), static_cast<double>(cur_len * ngram_size_),
        [&lambda](ptrdiff_t first, ptrdiff_t last) {
          for (auto b = static_cast<int64_t>(first), end = static_cast<int64_t>(last); b < end; ++b) {
            lambda(b);
--- a/onnxruntime/contrib_ops/cpu/crop.h
+++ b/onnxruntime/contrib_ops/cpu/crop.h
@ -6,7 +6,7 @@
 #include "core/common/common.h"
 #include "core/framework/op_kernel.h"

-#include "gsl/gsl"
+#include "core/common/gsl.h"

 namespace onnxruntime {
 namespace contrib {
--- a/onnxruntime/contrib_ops/cpu/image_scaler.h
+++ b/onnxruntime/contrib_ops/cpu/image_scaler.h
@ -3,8 +3,9 @@

 #pragma once

-#include <core/common/safeint.h>
 #include "core/common/common.h"
+#include "core/common/narrow.h"
+#include "core/common/safeint.h"
 #include "core/framework/op_kernel.h"
 #include "core/util/math_cpuonly.h"

@ -44,7 +45,7 @@ class ImageScaler final : public OpKernel {
    EigenArrayMap<T> Y_arr(Y->MutableData<T>(), SafeInt<size_t>(H) * W, SafeInt<size_t>(N) * C);

    for (int64_t nc = 0; nc < N * C; ++nc) {
-      Y_arr.col(gsl::narrow<size_t>(nc)) = scale_ * X_arr.col(gsl::narrow<size_t>(nc)) + bias_[gsl::narrow<size_t>(nc % C)];
+      Y_arr.col(narrow<size_t>(nc)) = scale_ * X_arr.col(narrow<size_t>(nc)) + bias_[narrow<size_t>(nc % C)];
    }
    return Status::OK();
  }
--- a/onnxruntime/contrib_ops/cpu/inverse.cc
+++ b/onnxruntime/contrib_ops/cpu/inverse.cc
@ -2,6 +2,7 @@
 // Licensed under the MIT License.

 #include "core/common/common.h"
+#include "core/common/narrow.h"
 #include "core/framework/op_kernel.h"
 #include "core/platform/threadpool.h"
 #include "core/util/math_cpuonly.h"
@ -41,8 +42,8 @@ struct Inverse::ComputeImpl {
    const auto* input_data = input->Data<T>() + batch_offset;
    auto* output_data = output->MutableData<T>() + batch_offset;

-    Eigen::Map<const MatrixT<T>> input_matrix(input_data, gsl::narrow<size_t>(rows), gsl::narrow<size_t>(cols));
-    Eigen::Map<MatrixT<T>> output_matrix(output_data, gsl::narrow<size_t>(rows), gsl::narrow<size_t>(cols));
+    Eigen::Map<const MatrixT<T>> input_matrix(input_data, narrow<size_t>(rows), narrow<size_t>(cols));
+    Eigen::Map<MatrixT<T>> output_matrix(output_data, narrow<size_t>(rows), narrow<size_t>(cols));
    output_matrix = input_matrix.inverse();
  }
 };
@ -56,8 +57,8 @@ struct Inverse::ComputeImpl<MLFloat16> {
    const auto* input_data = reinterpret_cast<const Eigen::half*>(input->Data<MLFloat16>() + batch_offset);
    auto* output_data = reinterpret_cast<Eigen::half*>(output->MutableData<MLFloat16>() + batch_offset);

-    Eigen::Map<const MatrixT<Eigen::half>> input_matrix(input_data, gsl::narrow<size_t>(rows), gsl::narrow<size_t>(cols));
-    Eigen::Map<MatrixT<Eigen::half>> output_matrix(output_data, gsl::narrow<size_t>(rows), gsl::narrow<size_t>(cols));
+    Eigen::Map<const MatrixT<Eigen::half>> input_matrix(input_data, narrow<size_t>(rows), narrow<size_t>(cols));
+    Eigen::Map<MatrixT<Eigen::half>> output_matrix(output_data, narrow<size_t>(rows), narrow<size_t>(cols));
    output_matrix = input_matrix.inverse();
  }
 };
@ -81,7 +82,7 @@ Status Inverse::Compute(OpKernelContext* ctx) const {
    t_disp.Invoke<ComputeImpl>(input, output, batch_num, rows, cols);
  };

-  concurrency::ThreadPool::TryBatchParallelFor(ctx->GetOperatorThreadPool(), gsl::narrow<size_t>(num_batches), std::move(fn), 0);
+  concurrency::ThreadPool::TryBatchParallelFor(ctx->GetOperatorThreadPool(), narrow<size_t>(num_batches), std::move(fn), 0);

  return Status::OK();
 }
--- a/onnxruntime/contrib_ops/cpu/math/sparse_dense_matmul.cc
+++ b/onnxruntime/contrib_ops/cpu/math/sparse_dense_matmul.cc
@ -4,6 +4,7 @@
 #if !defined(DISABLE_SPARSE_TENSORS)

 #include "core/framework/sparse_tensor.h"
+#include "core/common/narrow.h"
 #include "core/providers/cpu/math/gemm_matmul_common.h"
 #include "core/providers/cpu/math/matmul_helper.h"
 #include "core/util/math.h"
@ -120,9 +121,9 @@ struct SparseToDenseCoo {
    auto coo_view = A.AsCoo();
    const auto& ind_dims = coo_view.Indices().Shape().GetDims();
    ORT_RETURN_IF_NOT(ind_dims.size() == 2, "COO indices must be 2-D, got: ", ind_dims.size());
-    ConstEigenMatrixMapRowMajor<int64_t> a_indicies_map(coo_view.Indices().Data<int64_t>(), gsl::narrow<size_t>(ind_dims[0]), gsl::narrow<size_t>(ind_dims[1]));
-    ConstEigenMatrixMapRowMajor<T> map_b(B.Data<T>(), gsl::narrow<size_t>(b_dims[0]), gsl::narrow<size_t>(b_dims[1]));
-    EigenMatrixMapRowMajor<T> output_map(output.MutableData<T>(), gsl::narrow<size_t>(out_dims[0]), gsl::narrow<size_t>(out_dims[1]));
+    ConstEigenMatrixMapRowMajor<int64_t> a_indicies_map(coo_view.Indices().Data<int64_t>(), narrow<size_t>(ind_dims[0]), narrow<size_t>(ind_dims[1]));
+    ConstEigenMatrixMapRowMajor<T> map_b(B.Data<T>(), narrow<size_t>(b_dims[0]), narrow<size_t>(b_dims[1]));
+    EigenMatrixMapRowMajor<T> output_map(output.MutableData<T>(), narrow<size_t>(out_dims[0]), narrow<size_t>(out_dims[1]));
    output_map.setZero();

    const auto rhs_right = (ctx.trans_B) ? b_dims[0] : b_dims[1];
@ -139,8 +140,8 @@ struct SparseToDenseCoo {
      ORT_RETURN_IF_NOT(m < out_left, "COO m index: ", m, " is out of bounds of out_left: ", out_left);
      const T a_value = a_values[i];
      for (int64_t n = 0; n < rhs_right; ++n) {
-        const T b_value = (ctx.trans_B) ? map_b(gsl::narrow<size_t>(n), gsl::narrow<size_t>(k)) : map_b(gsl::narrow<size_t>(k), gsl::narrow<size_t>(n));
-        output_map(gsl::narrow<size_t>(m), gsl::narrow<size_t>(n)) += Mul(a_value, ctx.alpha, b_value);
+        const T b_value = (ctx.trans_B) ? map_b(narrow<size_t>(n), narrow<size_t>(k)) : map_b(narrow<size_t>(k), narrow<size_t>(n));
+        output_map(narrow<size_t>(m), narrow<size_t>(n)) += Mul(a_value, ctx.alpha, b_value);
      }
    }

--- a/onnxruntime/contrib_ops/cpu/maxpool_with_mask.h
+++ b/onnxruntime/contrib_ops/cpu/maxpool_with_mask.h
@ -7,6 +7,7 @@

 #pragma once
 #include "core/common/common.h"
+#include "core/common/narrow.h"
 #include "core/framework/op_kernel.h"
 #include "core/framework/tensor.h"
 #include "core/providers/cpu/nn/pool_base.h"
@ -229,7 +230,7 @@ class MaxpoolWithMask : public OpKernel, public PoolBase {
        int64_t y_step = pooled_height;
        const int64_t total_channels = x_shape[0] * channels;
        const int64_t total_mask_channels = m_shape[0] * m_shape[1];
-        RunMaxpoolLoop<MaxpoolWithMask1DTask<float>>(tp, gsl::narrow<size_t>(total_channels),
+        RunMaxpoolLoop<MaxpoolWithMask1DTask<float>>(tp, narrow<size_t>(total_channels),
                                                     {X_data, M_data, Y_data, x_step, y_step, pooled_height, stride_h(),
                                                      height, total_mask_channels, kernel_shape, pads});
        break;
@ -241,7 +242,7 @@ class MaxpoolWithMask : public OpKernel, public PoolBase {
        const int64_t total_channels = x_shape[0] * channels;
        const int64_t total_mask_channels = m_shape[0] * m_shape[1];
        RunMaxpoolLoop<MaxpoolWithMask2DTask<float>>(
-            tp, gsl::narrow<size_t>(total_channels),
+            tp, narrow<size_t>(total_channels),
            {X_data, M_data, Y_data, x_step, y_step, pooled_height, pooled_width, stride_h(), stride_w(), height, width,
             total_mask_channels, kernel_shape, pads});
        break;
@ -252,7 +253,7 @@ class MaxpoolWithMask : public OpKernel, public PoolBase {
        const int64_t total_channels = x_shape[0] * channels;
        const int64_t total_mask_channels = m_shape[0] * m_shape[1];
        RunMaxpoolLoop<MaxpoolWithMask3DTask<float>>(
-            tp, gsl::narrow<size_t>(total_channels),
+            tp, narrow<size_t>(total_channels),
            {X_data, M_data, Y_data, x_step, y_step, pooled_height, pooled_width, pooled_depth, stride_h(), stride_w(),
             stride_d(), height, width, depth, total_mask_channels, kernel_shape, pads});
        break;
--- a/onnxruntime/contrib_ops/cpu/nchwc_ops.cc
+++ b/onnxruntime/contrib_ops/cpu/nchwc_ops.cc
@ -1,8 +1,9 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.

-#include <core/common/safeint.h>
 #include "nchwc_ops.h"
+#include "core/common/narrow.h"
+#include "core/common/safeint.h"
 #include "core/mlas/inc/mlas.h"

 namespace onnxruntime {
@ -54,7 +55,7 @@ Status ReorderInput::Compute(OpKernelContext* context) const {
    // elements, so that operations involving a smaller number of channels will
    // process more rows per worker.
    constexpr ptrdiff_t worker_goal = 48 * 1024;
-    ptrdiff_t work_per_worker = std::max<ptrdiff_t>(worker_goal /  gsl::narrow<ptrdiff_t>(nchwc_channels), 1);
+    ptrdiff_t work_per_worker = std::max<ptrdiff_t>(worker_goal / narrow<ptrdiff_t>(nchwc_channels), 1);
    worker_count = std::max<ptrdiff_t>(total_work / work_per_worker, 1);
  } else {
    // Each iteration produces one spatial_size chunk of NCHWc blocks.
@ -258,27 +259,27 @@ std::vector<float> NchwcUpsample::ComputeInterpolation(int64_t input_length,
                                                       int64_t output_length,
                                                       int64_t scale) const {
  std::vector<float> interpolation;
-  interpolation.resize(gsl::narrow<size_t>(output_length));
+  interpolation.resize(narrow<size_t>(output_length));

  if (scale == 1) {
    // Identity map for unscaled.
    for (int64_t o = 0; o < output_length; o++) {
-      interpolation[gsl::narrow<size_t>(o)] = static_cast<float>(o);
+      interpolation[narrow<size_t>(o)] = static_cast<float>(o);
    }
  } else if (transformation_mode_ == TransformationMode::ALIGN_CORNERS) {
    for (int64_t o = 0; o < output_length; o++) {
-      interpolation[gsl::narrow<size_t>(o)] =
+      interpolation[narrow<size_t>(o)] =
          static_cast<float>(o) * static_cast<float>(input_length - 1) / static_cast<float>(output_length - 1);
    }
  } else if (transformation_mode_ == TransformationMode::HALF_PIXEL) {
    for (int64_t o = 0; o < output_length; o++) {
-      interpolation[gsl::narrow<size_t>(o)] =
+      interpolation[narrow<size_t>(o)] =
          std::max(0.0f, (static_cast<float>(o) + 0.5f) / static_cast<float>(scale) - 0.5f);
    }
  } else {
    // Default to TransformationMode::ASYMMETRIC.
    for (int64_t o = 0; o < output_length; o++) {
-      interpolation[gsl::narrow<size_t>(o)] = static_cast<float>(o) / static_cast<float>(scale);
+      interpolation[narrow<size_t>(o)] = static_cast<float>(o) / static_cast<float>(scale);
    }
  }

@ -353,7 +354,7 @@ Status NchwcUpsample::Compute(OpKernelContext* context) const {
              static_cast<size_t>(input_h),
              static_cast<size_t>(input_w),
              static_cast<size_t>(output_w),
-              interpolation_h[gsl::narrow<size_t>(row_index)],
+              interpolation_h[narrow<size_t>(row_index)],
              interpolation_w.data(),
              x_channel_base,
              y_row);
--- a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_lstm.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_lstm.cc
@ -1,3 +1,4 @@
+#include "core/common/narrow.h"
 #include "core/providers/cpu/rnn/lstm_base.h"
 #include "core/providers/cpu/rnn/rnn_helpers.h"
 #include "core/providers/cpu/rnn/uni_directional_lstm.h"
@ -188,8 +189,8 @@ Status DynamicQuantizeLSTM::Compute(OpKernelContext* context) const {
  ZeroPointCheck(w_zp, W_zp_shape, is_W_signed, Input);
  ZeroPointCheck(r_zp, R_zp_shape, is_R_signed, Recurrent);

-  size_t W_scale_size = W_scale_shape.NumDimensions() == 2 ? gsl::narrow<size_t>(W_scale_shape[1]) : 1;
-  size_t R_scale_size = R_scale_shape.NumDimensions() == 2 ? gsl::narrow<size_t>(R_scale_shape[1]) : 1;
+  size_t W_scale_size = W_scale_shape.NumDimensions() == 2 ? narrow<size_t>(W_scale_shape[1]) : 1;
+  size_t R_scale_size = R_scale_shape.NumDimensions() == 2 ? narrow<size_t>(R_scale_shape[1]) : 1;

  QuantizationParameter quant_para_W_1(w_scale->Data<float>(),
                                       static_cast<const uint8_t*>(w_zp->DataRaw()),
--- a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc
@ -1,6 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.

+#include "core/common/narrow.h"
 #include "core/common/safeint.h"
 #include "core/mlas/inc/mlas.h"
 #include "core/providers/cpu/math/element_wise_ops.h"
@ -102,7 +103,7 @@ Status MatMulIntegerToFloatBase::ComputeCommon(OpKernelContext* ctx,
    const float* b_scale_tensor_data = b_scale_tensor->Data<float>();

    if (is_b_scale_per_column) {
-      multipliers_per_column.reserve(gsl::narrow<size_t>(b_scale_tensor->Shape().Size()));
+      multipliers_per_column.reserve(narrow<size_t>(b_scale_tensor->Shape().Size()));
      std::transform(b_scale_tensor_data,
                     b_scale_tensor_data + b_scale_tensor->Shape().Size(),
                     std::back_inserter(multipliers_per_column),
@ -217,7 +218,7 @@ Status DynamicQuantizeMatMul::Compute(OpKernelContext* ctx) const {
  uint8_t* a_data_quant = static_cast<uint8_t*>(allocator->Alloc(SafeInt<size_t>(num_of_elements) * sizeof(uint8_t)));
  BufferUniquePtr a_buffer_quant_holder(a_data_quant, BufferDeleter(std::move(allocator)));

-  ParQuantizeLinear(a_data, a_data_quant, gsl::narrow<size_t>(num_of_elements), a_scale, a_zero_point, ctx->GetOperatorThreadPool());
+  ParQuantizeLinear(a_data, a_data_quant, narrow<size_t>(num_of_elements), a_scale, a_zero_point, ctx->GetOperatorThreadPool());

  bool is_b_scale_supported = IsBQuantParamSupported(b_scale_tensor->Shape(), b ? b->Shape() : b_shape_);
  ORT_RETURN_IF_ERROR(ComputeCommon(
--- a/onnxruntime/contrib_ops/cpu/quantization/qlinear_activations.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/qlinear_activations.cc
@ -4,6 +4,7 @@
 #include "qlinear_activations.h"
 #include "qlinear_lookup_table.h"

+#include "core/common/narrow.h"
 #include "core/mlas/inc/mlas.h"
 #include "core/platform/threadpool.h"

@ -53,7 +54,7 @@ Status QLinearLookupBase<T>::ComputeBase(OpKernelContext* context, Transformer f
  const uint8_t* x_data = reinterpret_cast<const uint8_t*>(X.Data<T>());
  uint8_t* y_data = reinterpret_cast<uint8_t*>(Y.MutableData<T>());
  ThreadPool::TryParallelFor(
-      tp, gsl::narrow<std::ptrdiff_t>(N), TensorOpCost{1.0, 1.0, 1.0},
+      tp, narrow<std::ptrdiff_t>(N), TensorOpCost{1.0, 1.0, 1.0},
      [this, x_data, y_data, &table](std::ptrdiff_t first, std::ptrdiff_t last) {
        QLinearLookupTableTransform(
            x_data + first,
--- a/onnxruntime/contrib_ops/cpu/quantization/qlinear_concat.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/qlinear_concat.cc
@ -4,6 +4,7 @@
 #include "qlinear_concat.h"
 #include "qlinear_lookup_table.h"

+#include "core/common/narrow.h"
 #include "core/providers/common.h"
 #include "core/mlas/inc/mlas.h"
 #include "core/platform/threadpool.h"
@ -158,9 +159,9 @@ Status QLinearConcat::Compute(OpKernelContext* ctx) const {
    uint8_t* output = static_cast<uint8_t*>(p.output_tensor->MutableDataRaw()) + initial_output_offset;
    for (int64_t cur_in_offset = 0; cur_in_offset < prep.num_elements; cur_in_offset += input_axis_pitch) {
      if (is_copy) {
-        memcpy(output, input + cur_in_offset, gsl::narrow<size_t>(input_axis_pitch));
+        memcpy(output, input + cur_in_offset, narrow<size_t>(input_axis_pitch));
      } else {
-        QLinearLookupTableTransform(input + cur_in_offset, table, output, gsl::narrow<size_t>(input_axis_pitch));
+        QLinearLookupTableTransform(input + cur_in_offset, table, output, narrow<size_t>(input_axis_pitch));
      }
      output += p.output_axis_pitch;
    }
--- a/onnxruntime/contrib_ops/cpu/quantization/qlinear_global_average_pool.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/qlinear_global_average_pool.cc
@ -2,6 +2,7 @@
 // Licensed under the MIT License.

 #include "qlinear_global_average_pool.h"
+#include "core/common/narrow.h"
 #include "core/util/math_cpuonly.h"
 #include "core/providers/common.h"
 #include "core/platform/threadpool.h"
@ -32,7 +33,7 @@ Status ComputeQLinearGlobalAvgPool(
      const T8Bits* input = (const T8Bits*)(x + (first * image_size));
      T8Bits* output = (T8Bits*)(y + first);
      std::vector<int32_t> acc_buffer(MlasQLinearSafePaddingElementCount(sizeof(int32_t), last - first));
-      MlasQLinearGlobalAveragePoolNchw(input, x_scale, x_zero_point, output, y_scale, y_zero_point, last - first, gsl::narrow<size_t>(image_size), acc_buffer.data());
+      MlasQLinearGlobalAveragePoolNchw(input, x_scale, x_zero_point, output, y_scale, y_zero_point, last - first, narrow<size_t>(image_size), acc_buffer.data());
    };
    concurrency::ThreadPool::TryParallelFor(
        tp, static_cast<std::ptrdiff_t>(N * C), {1.0 * image_size, 1.0, 8.0 * image_size}, worker);
@ -40,11 +41,11 @@ Status ComputeQLinearGlobalAvgPool(
    auto worker = [=](std::ptrdiff_t first, std::ptrdiff_t last) {
      const T8Bits* input = x + first * C * image_size;
      T8Bits* output = y + first * C;
-      std::vector<int32_t> acc_buffer(MlasQLinearSafePaddingElementCount(sizeof(int32_t), gsl::narrow<size_t>(C)));
-      std::vector<T8Bits> zero_buffer(MlasQLinearSafePaddingElementCount(sizeof(T8Bits), gsl::narrow<size_t>(C)), 0);
+      std::vector<int32_t> acc_buffer(MlasQLinearSafePaddingElementCount(sizeof(int32_t), narrow<size_t>(C)));
+      std::vector<T8Bits> zero_buffer(MlasQLinearSafePaddingElementCount(sizeof(T8Bits), narrow<size_t>(C)), 0);
      MlasQLinearGlobalAveragePoolNhwc(
          input, x_scale, x_zero_point, output, y_scale, y_zero_point,
-          last - first, gsl::narrow<size_t>(image_size), gsl::narrow<size_t>(C), gsl::narrow<size_t>(C), acc_buffer.data(), zero_buffer.data());
+          last - first, narrow<size_t>(image_size), narrow<size_t>(C), narrow<size_t>(C), acc_buffer.data(), zero_buffer.data());
    };
    concurrency::ThreadPool::TryParallelFor(
        tp, static_cast<std::ptrdiff_t>(N),
@ -79,11 +80,11 @@ Status QLinearGlobalAveragePool::Compute(OpKernelContext* context) const {

  int64_t N = x_shape[0];
  int64_t C = (channels_last_ ? x_shape.back() : x_shape[1]);
-  int64_t image_size = std::accumulate(x_shape.cbegin() + spatial_dim_start, x_shape.cbegin() + spatial_dim_end,
+  int64_t image_size = std::accumulate(x_shape.begin() + spatial_dim_start, x_shape.begin() + spatial_dim_end,
                                       1LL, std::multiplies<int64_t>());

  std::vector<int64_t> output_dims(x_shape.begin(), x_shape.end());
-  std::transform(x_shape.cbegin() + spatial_dim_start, x_shape.cbegin() + spatial_dim_end,
+  std::transform(x_shape.begin() + spatial_dim_start, x_shape.begin() + spatial_dim_end,
                 output_dims.begin() + spatial_dim_start, [](const int64_t&) { return int64_t{1}; });
  Tensor& Y = *context->Output(0, output_dims);

--- a/onnxruntime/contrib_ops/cpu/quantization/qlinear_softmax.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/qlinear_softmax.cc
@ -15,7 +15,7 @@

 #include "core/mlas/inc/mlas.h"
 #include "core/platform/threadpool.h"
-#include "gsl/gsl-lite.hpp"
+#include "core/common/gsl.h"

 namespace onnxruntime {
 namespace contrib {
--- a/onnxruntime/contrib_ops/cpu/tokenizer.cc
+++ b/onnxruntime/contrib_ops/cpu/tokenizer.cc
@ -2,6 +2,7 @@
 // Licensed under the MIT License.

 #include "core/common/common.h"
+#include "core/common/narrow.h"
 #include "core/common/utf8_util.h"
 #include "core/framework/tensor.h"
 #include "core/framework/op_kernel.h"
@ -473,10 +474,10 @@ Status Tokenizer::Compute(OpKernelContext* ctx) const {
  size_t C = 0;
  if (input_dims.size() == 1) {
    N = 1;
-    C = gsl::narrow<size_t>(input_dims[0]);
+    C = narrow<size_t>(input_dims[0]);
  } else if (input_dims.size() == 2) {
-    N = gsl::narrow<size_t>(input_dims[0]);
-    C = gsl::narrow<size_t>(input_dims[1]);
+    N = narrow<size_t>(input_dims[0]);
+    C = narrow<size_t>(input_dims[1]);
  } else {
    return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT,
                  "Input dimensions are either [C] or [N][C] allowed");
--- a/onnxruntime/contrib_ops/cpu/transformers/beam_search.cc
+++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search.cc
@ -26,7 +26,7 @@
 #include "core/framework/TensorSeq.h"
 #include "core/framework/allocator.h"
 #include "core/framework/ort_value.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "contrib_ops/cpu/transformers/beam_search.h"
 #include "contrib_ops/cpu/transformers/logits_processor.h"
 #include "contrib_ops/cpu/transformers/sequences.h"
--- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h
+++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h
@ -5,6 +5,8 @@

 #include "contrib_ops/cpu/transformers/beam_search_impl_base.h"

+#include "core/common/span_utils.h"
+
 namespace onnxruntime {
 namespace contrib {

@ -255,8 +257,8 @@ Status BeamSearchGpt<T>::Execute(const FeedsFetchesManager& feeds_fetches_manage
      bool increase_position = (iteration_counter > 1);
      ORT_RETURN_IF_ERROR(UpdateFeeds(fetches, feeds, current_length,
                                      position_ids, increase_position,
-                                      beam_next_tokens.as_span<const int32_t>(),
-                                      beam_indices.as_span<const int32_t>()));
+                                      ReinterpretAsSpan<const int32_t>(beam_next_tokens),
+                                      ReinterpretAsSpan<const int32_t>(beam_indices)));
    }
    fetches.clear();
  }
@ -280,7 +282,7 @@ Status BeamSearchGpt<T>::Execute(const FeedsFetchesManager& feeds_fetches_manage
  if (output_scores != nullptr) {
    gsl::span<float> target = output_scores->MutableDataAsSpan<float>();
    gsl::span<const float> source = gsl::span<const float>(beam_state.scores.data(), beam_state.scores.size());
-    assert(target.length() == source.length());
+    assert(target.size() == source.size());
    ORT_RETURN_IF_ERROR(this->device_copy_func_(target, source, nullptr, DeviceCopyDirection::deviceToDevice));
  }

--- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_t5.h
+++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_t5.h
@ -3,6 +3,7 @@

 #pragma once

+#include "core/common/span_utils.h"
 #include "contrib_ops/cpu/transformers/generation_shared.h"  // for DEBUG_GENERATION
 #include "contrib_ops/cpu/transformers/beam_search_impl_base.h"
 #include "contrib_ops/cpu/transformers/subgraph_t5_encoder.h"
@ -214,7 +215,7 @@ Status BeamSearchT5<T>::Execute(const FeedsFetchesManager& encoder_feeds_fetches
                                                cpu_state,
                                                iteration_counter));
    ++current_length;  // Increase sequence length after a new token is generated.
-    ORT_RETURN_IF_ERROR(decoder_subgraph_.CreateInitialFeeds(beam_next_tokens.as_span<const int32_t>(),
+    ORT_RETURN_IF_ERROR(decoder_subgraph_.CreateInitialFeeds(ReinterpretAsSpan<const int32_t>(beam_next_tokens),
                                                             this->implicit_inputs_,
                                                             encoder_feeds,
                                                             encoder_fetches,
@ -284,8 +285,8 @@ Status BeamSearchT5<T>::Execute(const FeedsFetchesManager& encoder_feeds_fetches
          decoder_fetches,
          decoder_feeds,
          num_present_outputs,
-          beam_next_tokens.as_span<const int32_t>(),
-          beam_indices.as_span<const int32_t>(),
+          ReinterpretAsSpan<const int32_t>(beam_next_tokens),
+          ReinterpretAsSpan<const int32_t>(beam_indices),
          parameters->num_beams,
          decoder_subgraph_.GetFirstPastInputIndex(),
          decoder_subgraph_.GetFirstPresentOutputIndex(),
@ -316,7 +317,7 @@ Status BeamSearchT5<T>::Execute(const FeedsFetchesManager& encoder_feeds_fetches
  if (output_scores != nullptr) {
    gsl::span<float> target = output_scores->MutableDataAsSpan<float>();
    gsl::span<const float> source = gsl::span<const float>(beam_state.scores.data(), beam_state.scores.size());
-    assert(target.length() == source.length());
+    assert(target.size() == source.size());
    ORT_RETURN_IF_ERROR(this->device_copy_func_(target, source, nullptr, DeviceCopyDirection::deviceToDevice));
  }

--- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc
+++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc
@ -5,6 +5,7 @@
 #include <math.h>
 #include "core/common/common.h"
 #include "core/common/safeint.h"
+#include "core/common/span_utils.h"
 #include "core/framework/allocator.h"
 #include "core/framework/tensorprotoutils.h"
 #include "core/framework/utils.h"
@ -188,7 +189,7 @@ void BeamSearchScorer::Process(ISequences* sequences,
        auto clone = hypothesis_buffer_.subspan(hypothesis_buffer_offset_, sequence_length);
        gsl::copy(src, clone);
        hypothesis_buffer_offset_ += static_cast<size_t>(sequence_length);
-        auto sequence = clone.template as_span<const int32_t>();
+        auto sequence = ReinterpretAsSpan<const int32_t>(clone);
        beam_hyp.Add(sequence, next_score);
      } else {
        // Add next predicted token since it is not eos_token.
@ -209,7 +210,7 @@ void BeamSearchScorer::Process(ISequences* sequences,
    //  Check if we are done so that we can save a pad step if all(done)
    if (!done_[batch]) {
      gsl::span<const float> topk_scores = next_scores.subspan(batch * num_beams_, top_k);
-      const float* best_sum_logprobs = std::max_element(topk_scores.begin(), topk_scores.end());
+      const auto best_sum_logprobs = std::max_element(topk_scores.begin(), topk_scores.end());
      if (beam_hyp.IsDone(*best_sum_logprobs, sequence_length)) {
        done_[batch] = true;
      }
--- a/onnxruntime/contrib_ops/cpu/transformers/generate_impl_base.h
+++ b/onnxruntime/contrib_ops/cpu/transformers/generate_impl_base.h
@ -6,6 +6,7 @@
 #include <string>
 #include <utility>
 #include <vector>
+#include "core/common/span_utils.h"
 #include "contrib_ops/cpu/transformers/generation_shared.h"

 namespace onnxruntime {
@ -142,7 +143,7 @@ class GenerateBase {
        return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
                               "Input 'attention_mask' is expected to have 2 dimensions, got ", dims_attn.size());
      }
-      if (dims_attn != dims) {
+      if (!SpanEq(dims_attn, dims)) {
        return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
                               "Input 'attention_mask' is expected to have same shape as input_ids");
      }
--- a/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.cc
+++ b/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.cc
@ -7,7 +7,7 @@
 #include "core/providers/cpu/math/top_k.h"
 #include "core/providers/cpu/math/softmax_shared.h"
 #include "core/common/safeint.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "contrib_ops/cpu/transformers/sequences.h"
 #include "contrib_ops/cpu/transformers/beam_search_scorer.h"
 #include "contrib_ops/cpu/transformers/generation_device_helper.h"
@ -526,7 +526,7 @@ void PickGptPastState(const std::vector<OrtValue>& last_outputs,

    gsl::span<T> past_span = gsl::make_span<T>(past.GetMutable<Tensor>()->MutableData<T>(), past_shape.Size());
    gsl::span<const T> present_span = gsl::make_span<const T>(present.Get<Tensor>().Data<T>(), past_shape.Size());
-    for (gsl::index j = 0; j < beam_indices.length(); j++) {
+    for (size_t j = 0; j < beam_indices.size(); j++) {
      int32_t beam_index = beam_indices[j];
      gsl::span<const T> present_key = present_span.subspan(beam_index * block_size_per_beam, block_size_per_beam);
      gsl::span<const T> present_value = present_span.subspan(past_key_size + beam_index * block_size_per_beam,
@ -563,7 +563,7 @@ Status UpdateGptFeeds(
  // The following updates inputs for subgraph

  // Update input_ids with next tokens.
-  int batch_beam_size = static_cast<int>(beam_next_tokens.length());
+  int batch_beam_size = static_cast<int>(beam_next_tokens.size());
  int64_t dims[] = {batch_beam_size, 1};
  TensorShape input_ids_shape(&dims[0], 2);
  auto int32_type = DataTypeImpl::GetType<int32_t>();
@ -712,7 +712,7 @@ void PickT5PastState(const std::vector<OrtValue>& last_outputs,

    gsl::span<T> past_span = gsl::make_span<T>(past.GetMutable<Tensor>()->MutableData<T>(), past_shape.Size());
    gsl::span<const T> present_span = gsl::make_span<const T>(present.Get<Tensor>().Data<T>(), past_shape.Size());
-    for (gsl::index j = 0; j < beam_indices.length(); j++) {
+    for (size_t j = 0; j < beam_indices.size(); j++) {
      int32_t beam_index = beam_indices[j];
      gsl::span<const T> present_beam = present_span.subspan(beam_index * block_size_per_beam, block_size_per_beam);
      gsl::span<T> past_beam = past_span.subspan(j * block_size_per_beam, block_size_per_beam);
@ -750,7 +750,7 @@ Status UpdateDecoderFeeds(
  // Only need copy beam next tokens to input_ids, and copy present_*_self_* to past_*_self_*,

  // Update input_ids with next tokens.
-  int batch_beam_size = static_cast<int>(beam_next_tokens.length());
+  int batch_beam_size = static_cast<int>(beam_next_tokens.size());

  // TODO(tianleiwu): Reuse buffer for input_ids to reduce memory allocation.
  OrtValue input_ids;
--- a/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.h
+++ b/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.h
@ -10,7 +10,7 @@
 #endif

 #include <vector>
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "contrib_ops/cpu/transformers/logits_processor.h"
 #include "contrib_ops/cpu/transformers/generation_shared.h"

--- a/onnxruntime/contrib_ops/cpu/transformers/generation_shared.h
+++ b/onnxruntime/contrib_ops/cpu/transformers/generation_shared.h
@ -4,7 +4,7 @@
 #pragma once

 #include <utility>
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "core/framework/allocator.h"
 #include "core/framework/ort_value.h"

--- a/onnxruntime/contrib_ops/cpu/transformers/greedy_search.cc
+++ b/onnxruntime/contrib_ops/cpu/transformers/greedy_search.cc
@ -25,7 +25,7 @@
 #include "core/framework/session_options.h"
 #include "core/framework/TensorSeq.h"
 #include "core/framework/ort_value.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "contrib_ops/cpu/transformers/greedy_search.h"
 #include "contrib_ops/cpu/transformers/logits_processor.h"
 #include "contrib_ops/cpu/transformers/sequences.h"
--- a/onnxruntime/contrib_ops/cpu/transformers/greedy_search_impl_gpt.h
+++ b/onnxruntime/contrib_ops/cpu/transformers/greedy_search_impl_gpt.h
@ -4,6 +4,8 @@
 #pragma once
 #include <algorithm>
 #include <vector>
+
+#include "core/common/span_utils.h"
 #include "contrib_ops/cpu/transformers/greedy_search_impl_base.h"

 namespace onnxruntime {
@ -219,7 +221,7 @@ Status GreedySearchGpt<T>::Execute(const FeedsFetchesManager& feeds_fetches_mana
      bool increase_position = (iteration_counter > 1);
      ORT_RETURN_IF_ERROR(UpdateFeeds(fetches, feeds, current_length,
                                      position_ids, increase_position,
-                                      next_tokens.as_span<const int32_t>()));
+                                      ReinterpretAsSpan<const int32_t>(next_tokens)));
    }
    fetches.clear();
  }
--- a/onnxruntime/contrib_ops/cpu/transformers/logits_processor.cc
+++ b/onnxruntime/contrib_ops/cpu/transformers/logits_processor.cc
@ -3,7 +3,9 @@

 #include <memory>
 #include <assert.h>
+#include "core/common/narrow.h"
 #include "core/common/safeint.h"
+#include "core/common/span_utils.h"
 #include "contrib_ops/cpu/transformers/logits_processor.h"
 #include "contrib_ops/cpu/transformers/dump_tensor.h"

@ -100,15 +102,15 @@ void NoRepeatNGramLogitsProcessor<T>::Process(const ISequences* sequences,
    gsl::span<T> beam_token_scores = next_token_scores.GetScores(i);
    gsl::span<const int32_t> sequence = sequences->GetSequence(i);

-    gsl::span<const int32_t> prefix = sequence.subspan(sequence.length() - prefix_length);
-    ORT_ENFORCE(prefix.length() == prefix_length);
+    gsl::span<const int32_t> prefix = sequence.subspan(sequence.size() - prefix_length);
+    ORT_ENFORCE(prefix.size() == narrow<size_t>(prefix_length));

    std::unordered_set<int32_t> blocked_word_ids;
-    for (int j = 0; j <= static_cast<int>(sequence.length()) - ngram_size_; j++) {
+    for (int j = 0; j <= static_cast<int>(sequence.size()) - ngram_size_; j++) {
      // Here we use naive algorithm for matching. The complexity is O(batch_beam_size * ngram_size * sequence_length)
      // TODO(tianleiwu): build N-Gram index (hash table with prefix of length NGram - 1 as key,
      //                  and list of last word of NGram as value) for fast matching.
-      if (ngram_size_ == 1 || prefix == sequence.subspan(j, prefix_length)) {
+      if (ngram_size_ == 1 || SpanEq(prefix, sequence.subspan(j, prefix_length))) {
        blocked_word_ids.insert(sequence[static_cast<gsl::index>(j) + prefix_length]);
      }
    }
--- a/onnxruntime/contrib_ops/cpu/transformers/sequences.cc
+++ b/onnxruntime/contrib_ops/cpu/transformers/sequences.cc
@ -10,7 +10,7 @@ namespace transformers {

 void Sequences::Init(gsl::span<int32_t> buffer, int batch_beam_size, int sequence_length, int max_length) {
  size_t sequences_size = SafeInt<size_t>(batch_beam_size) * max_length;
-  assert(buffer.length() == sequences_size + sequences_size);
+  assert(buffer.size() == sequences_size + sequences_size);

  sequences[0] = buffer.subspan(0, sequences_size);
  sequences[1] = buffer.subspan(sequences_size);
--- a/onnxruntime/contrib_ops/cpu/transformers/sequences.h
+++ b/onnxruntime/contrib_ops/cpu/transformers/sequences.h
@ -3,7 +3,7 @@

 #pragma once

-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "contrib_ops/cpu/transformers/generation_shared.h"

 namespace onnxruntime {
--- a/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.cc
+++ b/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.cc
@ -7,7 +7,7 @@
 #include "core/framework/tensorprotoutils.h"
 #include "core/framework/utils.h"
 #include "core/providers/cpu/tensor/utils.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "contrib_ops/cpu/transformers/subgraph_base.h"
 #include "contrib_ops/cpu/transformers/dump_tensor.h"

--- a/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.h
+++ b/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.h
@ -5,7 +5,7 @@

 #include <vector>
 #include <string>
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "core/framework/allocator.h"
 #include "core/framework/feeds_fetches_manager.h"
 #include "contrib_ops/cpu/transformers/generation_device_helper.h"
@ -48,7 +48,7 @@ class Subgraph {
  Status Setup(const SessionState& session_state,
               const SessionState& subgraph_session_state);

-  FeedsFetchesManager* GetFeedsFetchesManager() { 
+  FeedsFetchesManager* GetFeedsFetchesManager() {
    return (feeds_fetches_manager_.has_value()) ? &*feeds_fetches_manager_ : nullptr;
  }

--- a/onnxruntime/contrib_ops/cpu/transformers/subgraph_gpt.cc
+++ b/onnxruntime/contrib_ops/cpu/transformers/subgraph_gpt.cc
@ -6,7 +6,7 @@
 #include "core/framework/tensorprotoutils.h"
 #include "core/framework/utils.h"
 #include "core/providers/cpu/tensor/utils.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "contrib_ops/cpu/transformers/subgraph_gpt.h"
 #include "contrib_ops/cpu/transformers/dump_tensor.h"

--- a/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_decoder.cc
+++ b/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_decoder.cc
@ -6,7 +6,7 @@
 #include "core/framework/tensorprotoutils.h"
 #include "core/framework/utils.h"
 #include "core/providers/cpu/tensor/utils.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "contrib_ops/cpu/transformers/subgraph_t5_decoder.h"
 #include "contrib_ops/cpu/transformers/dump_tensor.h"
 #include "contrib_ops/cpu/transformers/generation_device_helper.h"
@ -139,7 +139,7 @@ Status T5DecoderSubgraph::CreateInitialFeeds(
  AllocatorPtr allocator = session_state_->GetAllocator(encoder_feeds[0].Get<Tensor>().Location());

  // Copy beam next tokens in CPU to input_ids in provider device (CPU for CPU EP, or GPU for CUDA EP).
-  int batch_beam_size = static_cast<int>(beam_next_tokens.length());
+  int batch_beam_size = static_cast<int>(beam_next_tokens.size());
  int64_t dims[] = {batch_beam_size, 1};
  TensorShape input_ids_shape(&dims[0], 2);
  OrtValue input_ids;
--- a/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_encoder.cc
+++ b/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_encoder.cc
@ -6,7 +6,7 @@
 #include "core/framework/tensorprotoutils.h"
 #include "core/framework/utils.h"
 #include "core/providers/cpu/tensor/utils.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "contrib_ops/cpu/transformers/subgraph_t5_encoder.h"

 namespace onnxruntime {
--- a/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu
+++ b/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu
@ -254,7 +254,7 @@ Status QkvToContext(
  } else if (nullptr != mask_index) {  // 1d mask index
    ORT_ENFORCE(mask_index_dims.size() == 1);
    // mask_index has 1D shape: either (batch_size) or (2*batch_size). Only the later one has start postions.
-    const int* mask_start = (mask_index_dims.at(0) > batch_size) ? mask_index + batch_size : nullptr;
+    const int* mask_start = (mask_index_dims[0] > batch_size) ? mask_index + batch_size : nullptr;
    ORT_RETURN_IF_ERROR(ComputeSoftmaxWithMask1D<T>(
        stream, total_sequence_length, sequence_length, batch_size, num_heads,
        mask_index, mask_start, data.extra_add_qk, scratch1, scratch2, parameters.is_unidirectional));
--- a/onnxruntime/contrib_ops/cuda/math/bias_dropout.h
+++ b/onnxruntime/contrib_ops/cuda/math/bias_dropout.h
@ -3,7 +3,6 @@

 #pragma once

-#include "gsl/gsl"
 #include "core/providers/cuda/cuda_kernel.h"
 #include "core/providers/cuda/cuda_common.h"
 #include "core/framework/random_generator.h"
--- a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc
+++ b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc
@ -394,7 +394,7 @@ Status ProcessLogits(const OrtValue& logits,                                 //

  gsl::span<const float> next_scores = gsl::make_span(
      cpu_state->topk_scores.data(),
-      static_cast<typename gsl::span<float>::index_type>(topk_scores->Shape().Size()));
+      static_cast<typename gsl::span<float>::size_type>(topk_scores->Shape().Size()));
  gsl::span<const int32_t> next_tokens(cpu_state->topk_tokens.data(), beam_state->next_tokens.size());
  gsl::span<const int32_t> next_indices(cpu_state->topk_indices.data(), beam_state->next_indices.size());

@ -579,7 +579,7 @@ Status PickGptPastState(const std::vector<OrtValue>& last_outputs,

    gsl::span<T> past_span = gsl::make_span<T>(past.GetMutable<Tensor>()->MutableData<T>(), past_shape.Size());
    gsl::span<const T> present_span = gsl::make_span<const T>(present.Get<Tensor>().Data<T>(), past_shape.Size());
-    for (gsl::index j = 0; j < beam_indices.length(); j++) {
+    for (size_t j = 0; j < beam_indices.size(); j++) {
      int32_t beam_index = beam_indices[j];
      gsl::span<const T> present_key = present_span.subspan(beam_index * block_size_per_beam, block_size_per_beam);
      gsl::span<const T> present_value = present_span.subspan(past_key_size + beam_index * block_size_per_beam,
@ -623,7 +623,7 @@ Status PickT5PastState(const std::vector<OrtValue>& last_outputs,

    gsl::span<T> past_span = gsl::make_span<T>(past.GetMutable<Tensor>()->MutableData<T>(), past_shape.Size());
    gsl::span<const T> present_span = gsl::make_span<const T>(present.Get<Tensor>().Data<T>(), past_shape.Size());
-    for (gsl::index j = 0; j < beam_indices.length(); j++) {
+    for (size_t j = 0; j < beam_indices.size(); j++) {
      int32_t beam_index = beam_indices[j];
      gsl::span<const T> present_beam = present_span.subspan(beam_index * block_size_per_beam, block_size_per_beam);
      gsl::span<T> past_beam = past_span.subspan(j * block_size_per_beam, block_size_per_beam);
@ -652,7 +652,7 @@ Status UpdateGptFeeds(
    int gpt_subgraph_first_past_input_idx,
    int gpt_subgraph_first_present_output_idx) {
  // Update input_ids with next tokens.
-  int batch_beam_size = static_cast<int>(beam_next_tokens.length());
+  int batch_beam_size = static_cast<int>(beam_next_tokens.size());
  int64_t dims[] = {batch_beam_size, 1};
  TensorShape input_ids_shape(&dims[0], 2);
  auto element_type = DataTypeImpl::GetType<int32_t>();
@ -732,7 +732,7 @@ Status UpdateDecoderFeeds(
  ORT_UNUSED_PARAMETER(current_length);

  // Update input_ids with next tokens.
-  int batch_beam_size = static_cast<int>(beam_next_tokens.length());
+  int batch_beam_size = static_cast<int>(beam_next_tokens.size());
  int64_t dims[] = {batch_beam_size, 1};
  TensorShape input_ids_shape(&dims[0], 2);
  auto element_type = DataTypeImpl::GetType<int32_t>();
--- a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.h
+++ b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.h
@ -7,7 +7,7 @@
 #include "core/providers/cpu/tensor/utils.h"
 #include "core/providers/cuda/cuda_common.h"

-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "contrib_ops/cpu/transformers/generation_shared.h"

 namespace onnxruntime {
--- a/onnxruntime/contrib_ops/rocm/bert/attention_impl.cu
+++ b/onnxruntime/contrib_ops/rocm/bert/attention_impl.cu
@ -154,7 +154,7 @@ Status QkvToContext(
  // apply softmax and store result P to scratch2: BxNxSxS*
  if (use_raw_attention_mask) {  // 2d, 3d or 4d attention mask
    const int mask_dimension = static_cast<int>(mask_index_dims.size());
-    const int max_sequence_length = mask_dimension == 4 ? static_cast<int>(mask_index_dims.at(3)) : 0;
+    const int max_sequence_length = mask_dimension == 4 ? static_cast<int>(mask_index_dims[3]) : 0;

    T* persistent_softmax_workspace = scratch1;  // replace Q*K' in place if persistent softmax is selected.
    ORT_RETURN_IF_ERROR(
@ -165,7 +165,7 @@ Status QkvToContext(
  } else if (nullptr != mask_index) {  // 1d mask index
    ORT_ENFORCE(mask_index_dims.size() == 1);
    // mask_index has 1D shape: either (batch_size) or (2*batch_size). Only the later one has start postions.
-    const int* mask_start = (mask_index_dims.at(0) > batch_size) ? mask_index + batch_size : nullptr;
+    const int* mask_start = (mask_index_dims[0] > batch_size) ? mask_index + batch_size : nullptr;
    ORT_RETURN_IF_ERROR(ComputeSoftmaxWithMask1D<T>(stream, all_sequence_length, sequence_length, batch_size, num_heads,
                                     mask_index, mask_start, extra_add_qk, scratch1, scratch2, is_unidirectional));
  } else {  // no mask
--- a/onnxruntime/core/codegen/mti/mti_tvm_utils.h
+++ b/onnxruntime/core/codegen/mti/mti_tvm_utils.h
@ -5,7 +5,7 @@

 #include <string>
 #include <vector>
-#include <gsl/gsl>
+#include "core/common/gsl.h"
 #include <tvm/tvm.h>
 #include "core/codegen/mti/common.h"

--- a/onnxruntime/core/codegen/mti/tensor/concat_ops.cc
+++ b/onnxruntime/core/codegen/mti/tensor/concat_ops.cc
@ -4,7 +4,7 @@
 #include "core/codegen/mti/tensor/concat_ops.h"

 #include "core/codegen/mti/mti_tvm_utils.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include <topi/transform.h>

 namespace onnxruntime {
--- a/onnxruntime/core/codegen/mti/tensor/gather.cc
+++ b/onnxruntime/core/codegen/mti/tensor/gather.cc
@ -4,7 +4,7 @@
 #include "core/codegen/mti/tensor/gather.h"

 #include "core/codegen/mti/mti_tvm_utils.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include <topi/transform.h>

 namespace onnxruntime {
--- a/onnxruntime/core/codegen/mti/tensor/slice.cc
+++ b/onnxruntime/core/codegen/mti/tensor/slice.cc
@ -5,7 +5,7 @@

 #include "core/codegen/mti/mti_tvm_utils.h"
 #include <climits>
-#include <gsl/gsl>
+#include "core/common/gsl.h"
 #include <topi/transform.h>
 #include <tvm/ir_pass.h>

--- a/onnxruntime/core/codegen/mti/tensor/split.cc
+++ b/onnxruntime/core/codegen/mti/tensor/split.cc
@ -4,7 +4,7 @@
 #include "core/codegen/mti/tensor/split.h"

 #include "core/codegen/mti/mti_tvm_utils.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include <topi/transform.h>

 namespace onnxruntime {
--- a/onnxruntime/core/codegen/mti/tensor/tile.cc
+++ b/onnxruntime/core/codegen/mti/tensor/tile.cc
@ -3,7 +3,7 @@

 #include "core/codegen/mti/tensor/tile.h"
 #include "core/codegen/mti/mti_tvm_utils.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"

 namespace onnxruntime {
 namespace tvm_codegen {
--- a/onnxruntime/core/codegen/passes/utils/ort_tvm_utils.cc
+++ b/onnxruntime/core/codegen/passes/utils/ort_tvm_utils.cc
@ -7,7 +7,7 @@
 #include "core/codegen/passes/utils/codegen_context.h"
 #include "core/framework/tensorprotoutils.h"
 #include "core/providers/common.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"

 #include <topi/detail/extern.h>

--- a/onnxruntime/core/common/logging/capture.cc
+++ b/onnxruntime/core/common/logging/capture.cc
@ -3,7 +3,7 @@

 #include "core/common/logging/capture.h"
 #include "core/common/logging/logging.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"

 namespace onnxruntime {
 namespace logging {
@ -43,7 +43,7 @@ void Capture::ProcessPrintf(msvc_printf_check const char* format, va_list args)
  const int nbrcharacters = vsnprintf(message.data(), message.size(), format, args);
 #endif
  error = nbrcharacters < 0;
-  truncated = (nbrcharacters >= 0 && static_cast<gsl::index>(nbrcharacters) > message.size());
+  truncated = (nbrcharacters >= 0 && static_cast<size_t>(nbrcharacters) > message.size());
 #endif

  if (error) {
--- a/onnxruntime/core/flatbuffers/flatbuffers_utils.cc
+++ b/onnxruntime/core/flatbuffers/flatbuffers_utils.cc
@ -3,9 +3,8 @@

 #include "core/flatbuffers/flatbuffers_utils.h"

-#include "gsl/gsl"
-
 #include "core/common/common.h"
+#include "core/common/gsl.h"
 #include "core/flatbuffers/schema/ort.fbs.h"
 #include "core/graph/constants.h"
 #include "core/graph/onnx_protobuf.h"
--- a/onnxruntime/core/framework/allocation_planner.cc
+++ b/onnxruntime/core/framework/allocation_planner.cc
@ -605,10 +605,10 @@ class PlannerImpl {
        UseCount(name)++;

        bool is_graph_input = (graph_inputs.find(name) != graph_inputs.cend());
-        bool is_outer_scope_arg = std::find_if(outer_scope_node_args_.cbegin(), outer_scope_node_args_.cend(),
+        bool is_outer_scope_arg = std::find_if(outer_scope_node_args_.begin(), outer_scope_node_args_.end(),
                                               [&name](const NodeArg* value) {
                                                 return value && value->Name() == name;
-                                               }) != outer_scope_node_args_.cend();
+                                               }) != outer_scope_node_args_.end();
        bool is_subgraph = (parent_node_ != nullptr);

        // If it's a graph input or outer scope node arg, set its plan.
--- a/onnxruntime/core/framework/allocatormgr.cc
+++ b/onnxruntime/core/framework/allocatormgr.cc
@ -2,12 +2,15 @@
 // Licensed under the MIT License.

 #include "core/framework/allocatormgr.h"
-#include "core/framework/bfc_arena.h"
-#include "core/common/logging/logging.h"
+
+#include <limits>
 #include <mutex>
 #include <sstream>
 #include <unordered_map>
-#include <limits>
+
+#include "core/common/logging/logging.h"
+#include "core/common/narrow.h"
+#include "core/framework/bfc_arena.h"

 namespace onnxruntime {
 using namespace common;
@ -15,9 +18,9 @@ using namespace common;
 namespace {
 int32_t MakeKey(OrtMemType mem_type, OrtDevice device) {
  // shorten device id so we can fit everything
-  uint8_t short_device = gsl::narrow<uint8_t>(device.Id());
+  uint8_t short_device = narrow<uint8_t>(device.Id());
  // and convert mem_type. OrtMemType weirdly uses -2 as the first value so we offset by that before narrowing
-  uint8_t ort_mem_type = gsl::narrow<uint8_t>(mem_type + 2);
+  uint8_t ort_mem_type = narrow<uint8_t>(mem_type + 2);

  // NOTE: OrtMemType is the type of memory for a kernel's input/output
  //       OrtDevice.MemType is the device memory type.
--- a/onnxruntime/core/framework/data_transfer_utils.h
+++ b/onnxruntime/core/framework/data_transfer_utils.h
@ -5,7 +5,7 @@

 #include <type_traits>

-#include "gsl/gsl"
+#include "core/common/gsl.h"

 #include "core/common/common.h"
 #include "core/framework/tensor.h"
--- a/onnxruntime/core/framework/endian_utils.h
+++ b/onnxruntime/core/framework/endian_utils.h
@ -5,7 +5,7 @@

 #include <type_traits>

-#include "gsl/gsl"
+#include "core/common/gsl.h"

 #include "core/common/status.h"
 #include "core/common/common.h"
--- a/onnxruntime/core/framework/error_code.cc
+++ b/onnxruntime/core/framework/error_code.cc
@ -1,7 +1,6 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.

-#include "core/common/gsl_suppress.h"
 #include "core/session/onnxruntime_c_api.h"
 #include "core/session/ort_apis.h"
 #include "core/common/status.h"
--- a/onnxruntime/core/framework/fallback_cpu_capability.h
+++ b/onnxruntime/core/framework/fallback_cpu_capability.h
@ -3,8 +3,7 @@

 #pragma once

-#include <gsl/gsl>
-
+#include "core/common/gsl.h"
 #include "core/common/inlined_containers_fwd.h"
 #include "core/framework/execution_provider.h"  // for IExecutionProvider::IKernelLookup
 #include "core/graph/graph_viewer.h"
--- a/onnxruntime/core/framework/kernel_def_builder.cc
+++ b/onnxruntime/core/framework/kernel_def_builder.cc
@ -7,8 +7,6 @@
 #include <unordered_set>
 #include <string>

-#include "gsl/gsl"
-
 namespace onnxruntime {
 namespace {

--- a/onnxruntime/core/framework/kernel_lookup.h
+++ b/onnxruntime/core/framework/kernel_lookup.h
@ -3,9 +3,8 @@

 #pragma once

-#include "gsl/gsl"
-
 #include "core/common/common.h"
+#include "core/common/gsl.h"
 #include "core/framework/execution_provider.h"  // for IExecutionProvider::IKernelLookup
 #include "core/framework/kernel_registry.h"
 #include "core/framework/kernel_type_str_resolver.h"
--- a/onnxruntime/core/framework/kernel_registry_manager.h
+++ b/onnxruntime/core/framework/kernel_registry_manager.h
@ -7,8 +7,7 @@
 #include <variant>
 #include <unordered_map>

-#include "gsl/gsl"
-
+#include "core/common/gsl.h"
 #include "core/common/inlined_containers.h"
 #include "core/common/status.h"
 #include "core/framework/kernel_type_str_resolver.h"
--- a/onnxruntime/core/framework/kernel_type_str_resolver.h
+++ b/onnxruntime/core/framework/kernel_type_str_resolver.h
@ -7,12 +7,11 @@
 #include <string_view>
 #include <utility>

-#include "gsl/gsl"
-
 #if !defined(ORT_MINIMAL_BUILD)
 #include "onnx/defs/schema.h"
 #endif  // !defined(ORT_MINIMAL_BUILD)

+#include "core/common/gsl.h"
 #include "core/common/inlined_containers.h"
 #include "core/common/status.h"
 #include "core/graph/op_identifier.h"
--- a/onnxruntime/core/framework/kernel_type_str_resolver_utils.h
+++ b/onnxruntime/core/framework/kernel_type_str_resolver_utils.h
@ -5,8 +5,7 @@

 #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)

-#include "gsl/gsl"
-
+#include "core/common/gsl.h"
 #include "core/common/status.h"
 #include "core/framework/kernel_type_str_resolver.h"
 #include "core/graph/op_identifier.h"
--- a/onnxruntime/core/framework/math.h
+++ b/onnxruntime/core/framework/math.h
@ -3,8 +3,7 @@

 #pragma once

-#include <gsl/gsl>
-
+#include "core/common/narrow.h"
 #include "core/framework/tensor.h"
 #include "core/util/math_cpuonly.h"

@ -12,12 +11,12 @@ namespace onnxruntime {

 template <typename T>
 auto EigenMap(Tensor& t) -> EigenVectorMap<T> {
-  return EigenVectorMap<T>(t.MutableData<T>(), gsl::narrow<ptrdiff_t>(t.Shape().Size()));
+  return EigenVectorMap<T>(t.MutableData<T>(), narrow<ptrdiff_t>(t.Shape().Size()));
 }

 template <typename T>
 auto EigenMap(const Tensor& t) -> ConstEigenVectorMap<T> {
-  return ConstEigenVectorMap<T>(t.Data<T>(), gsl::narrow<ptrdiff_t>(t.Shape().Size()));
+  return ConstEigenVectorMap<T>(t.Data<T>(), narrow<ptrdiff_t>(t.Shape().Size()));
 }

 }  // namespace onnxruntime
--- a/onnxruntime/core/framework/onnxruntime_typeinfo.h
+++ b/onnxruntime/core/framework/onnxruntime_typeinfo.h
@ -4,7 +4,6 @@
 #pragma once
 #include <atomic>
 #include <string>
-#include "core/common/gsl_suppress.h"
 #include "core/session/onnxruntime_c_api.h"

 namespace onnxruntime {
--- a/onnxruntime/core/framework/op_node_proto_helper.cc
+++ b/onnxruntime/core/framework/op_node_proto_helper.cc
@ -6,7 +6,7 @@
 #include "core/framework/tensorprotoutils.h"
 #include "core/graph/onnx_protobuf.h"
 #include "core/graph/op.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"

 using namespace ONNX_NAMESPACE;
 using namespace ::onnxruntime::common;
--- a/onnxruntime/core/framework/random_seed.cc
+++ b/onnxruntime/core/framework/random_seed.cc
@ -3,14 +3,13 @@

 #include "random_seed.h"
 #include "random_generator.h"
-#include "core/common/gsl_suppress.h"
 #include <atomic>
 #include <chrono>

 namespace onnxruntime {
 namespace utils {

-// "Global initializer calls a non-constexpr function." 
+// "Global initializer calls a non-constexpr function."
 //TODO: Fix the warning. The variable should be put in the environment class.
 #if defined(_MSC_VER) && !defined(__clang__)
 #pragma warning(push)
--- a/onnxruntime/core/framework/session_options.h
+++ b/onnxruntime/core/framework/session_options.h
@ -5,7 +5,7 @@

 #include <string>
 #include <vector>
-#include "core/common/gsl_suppress.h"
+#include "core/common/gsl.h"
 #include "core/common/inlined_containers.h"
 #include "core/session/onnxruntime_c_api.h"
 #include "core/optimizer/graph_transformer_level.h"
--- a/Show more
+++ b/Show more