From 2ecd1d662203d3fb5d77bf2aa97f6b8bcd7d1759 Mon Sep 17 00:00:00 2001
From: Edward Chen <18449977+edgchen1@users.noreply.github.com>
Date: Sat, 29 Oct 2022 04:15:20 -0700
Subject: [PATCH] Switch GSL to MS GSL 4.0.0 (#13416)

---
 cgmanifests/cgmanifest.json                   |    4 +-
 cgmanifests/generated/cgmanifest.json         |    2 +-
 cmake/CMakeLists.txt                          |   17 +-
 cmake/external/googletest                     |    2 +-
 cmake/external/gsl.cmake                      |   16 +
 cmake/external/gsl.natvis                     |   14 -
 cmake/onnxruntime_common.cmake                |    9 +-
 cmake/onnxruntime_flatbuffers.cmake           |    4 +-
 cmake/onnxruntime_mlas.cmake                  |    1 +
 cmake/onnxruntime_providers.cmake             |    2 +-
 cmake/onnxruntime_unittests.cmake             |    2 +
 cmake/winml.cmake                             |   20 +-
 cmake/winml_unittests.cmake                   |    2 +-
 include/onnxruntime/core/common/common.h      |    1 -
 include/onnxruntime/core/common/gsl.h         |    6 +
 .../onnxruntime/core/common/gsl_suppress.h    |   15 -
 .../onnxruntime/core/common/logging/capture.h |    2 +-
 include/onnxruntime/core/common/narrow.h      |   77 +
 include/onnxruntime/core/common/span_utils.h  |   36 +-
 include/onnxruntime/core/common/status.h      |    2 +-
 .../onnxruntime/core/framework/data_types.h   |    2 +-
 include/onnxruntime/core/framework/float16.h  |    6 +-
 .../onnxruntime/core/framework/op_kernel.h    |    2 +-
 .../core/framework/op_kernel_info.h           |    2 +-
 .../core/framework/op_node_proto_helper.h     |    2 +-
 include/onnxruntime/core/framework/tensor.h   |    4 +-
 .../onnxruntime/core/framework/tensor_shape.h |    6 +-
 include/onnxruntime/core/graph/graph.h        |   17 +-
 .../core/session/onnxruntime_c_api.h          |    1 -
 .../cpu/attnlstm/attention_mechanism.h        |    2 +-
 .../cpu/attnlstm/attention_wrapper.cc         |    2 +-
 .../cpu/attnlstm/bahdanau_attention.cc        |    6 +-
 .../cpu/attnlstm/deep_cpu_attn_lstm.cc        |   35 +-
 .../cpu/attnlstm/deep_cpu_attn_lstm.h         |    3 +-
 .../cpu/attnlstm/uni_dir_attn_lstm.cc         |   20 +-
 .../cpu/attnlstm/uni_dir_attn_lstm.h          |    4 +-
 .../contrib_ops/cpu/bert/attention_helper.h   |    2 +-
 .../cpu/bert/bifurcation_detector.h           |    8 +-
 .../contrib_ops/cpu/bert/ngram_repeat_block.h |    7 +-
 onnxruntime/contrib_ops/cpu/crop.h            |    2 +-
 onnxruntime/contrib_ops/cpu/image_scaler.h    |    5 +-
 onnxruntime/contrib_ops/cpu/inverse.cc        |   11 +-
 .../cpu/math/sparse_dense_matmul.cc           |   11 +-
 .../contrib_ops/cpu/maxpool_with_mask.h       |    7 +-
 onnxruntime/contrib_ops/cpu/nchwc_ops.cc      |   17 +-
 .../cpu/quantization/dynamic_quantize_lstm.cc |    5 +-
 .../quantization/dynamic_quantize_matmul.cc   |    5 +-
 .../cpu/quantization/qlinear_activations.cc   |    3 +-
 .../cpu/quantization/qlinear_concat.cc        |    5 +-
 .../qlinear_global_average_pool.cc            |   13 +-
 .../cpu/quantization/qlinear_softmax.cc       |    2 +-
 onnxruntime/contrib_ops/cpu/tokenizer.cc      |    7 +-
 .../cpu/transformers/beam_search.cc           |    2 +-
 .../cpu/transformers/beam_search_impl_gpt.h   |    8 +-
 .../cpu/transformers/beam_search_impl_t5.h    |    9 +-
 .../cpu/transformers/beam_search_scorer.cc    |    5 +-
 .../cpu/transformers/generate_impl_base.h     |    3 +-
 .../transformers/generation_device_helper.cc  |   10 +-
 .../transformers/generation_device_helper.h   |    2 +-
 .../cpu/transformers/generation_shared.h      |    2 +-
 .../cpu/transformers/greedy_search.cc         |    2 +-
 .../cpu/transformers/greedy_search_impl_gpt.h |    4 +-
 .../cpu/transformers/logits_processor.cc      |   10 +-
 .../contrib_ops/cpu/transformers/sequences.cc |    2 +-
 .../contrib_ops/cpu/transformers/sequences.h  |    2 +-
 .../cpu/transformers/subgraph_base.cc         |    2 +-
 .../cpu/transformers/subgraph_base.h          |    4 +-
 .../cpu/transformers/subgraph_gpt.cc          |    2 +-
 .../cpu/transformers/subgraph_t5_decoder.cc   |    4 +-
 .../cpu/transformers/subgraph_t5_encoder.cc   |    2 +-
 .../contrib_ops/cuda/bert/attention_impl.cu   |    2 +-
 .../contrib_ops/cuda/math/bias_dropout.h      |    1 -
 .../transformers/generation_device_helper.cc  |   10 +-
 .../transformers/generation_device_helper.h   |    2 +-
 .../contrib_ops/rocm/bert/attention_impl.cu   |    4 +-
 onnxruntime/core/codegen/mti/mti_tvm_utils.h  |    2 +-
 .../core/codegen/mti/tensor/concat_ops.cc     |    2 +-
 onnxruntime/core/codegen/mti/tensor/gather.cc |    2 +-
 onnxruntime/core/codegen/mti/tensor/slice.cc  |    2 +-
 onnxruntime/core/codegen/mti/tensor/split.cc  |    2 +-
 onnxruntime/core/codegen/mti/tensor/tile.cc   |    2 +-
 .../codegen/passes/utils/ort_tvm_utils.cc     |    2 +-
 onnxruntime/core/common/logging/capture.cc    |    4 +-
 .../core/flatbuffers/flatbuffers_utils.cc     |    3 +-
 .../core/framework/allocation_planner.cc      |    4 +-
 onnxruntime/core/framework/allocatormgr.cc    |   13 +-
 .../core/framework/data_transfer_utils.h      |    2 +-
 onnxruntime/core/framework/endian_utils.h     |    2 +-
 onnxruntime/core/framework/error_code.cc      |    1 -
 .../core/framework/fallback_cpu_capability.h  |    3 +-
 .../core/framework/kernel_def_builder.cc      |    2 -
 onnxruntime/core/framework/kernel_lookup.h    |    3 +-
 .../core/framework/kernel_registry_manager.h  |    3 +-
 .../core/framework/kernel_type_str_resolver.h |    3 +-
 .../kernel_type_str_resolver_utils.h          |    3 +-
 onnxruntime/core/framework/math.h             |    7 +-
 .../core/framework/onnxruntime_typeinfo.h     |    1 -
 .../core/framework/op_node_proto_helper.cc    |    2 +-
 onnxruntime/core/framework/random_seed.cc     |    3 +-
 onnxruntime/core/framework/session_options.h  |    2 +-
 onnxruntime/core/framework/session_state.h    |    2 +-
 .../core/framework/session_state_utils.cc     |    4 +-
 onnxruntime/core/framework/sparse_tensor.cc   |   31 +-
 onnxruntime/core/framework/sparse_utils.cc    |   14 +-
 .../core/framework/tensor_type_and_shape.cc   |    4 +-
 .../core/framework/tensorprotoutils.cc        |   28 +-
 onnxruntime/core/framework/transpose_helper.h |    4 +-
 onnxruntime/core/graph/graph.cc               |   13 +-
 .../core/graph/graph_flatbuffers_utils.cc     |   16 +-
 .../core/graph/graph_flatbuffers_utils.h      |   48 +-
 onnxruntime/core/graph/graph_utils.h          |    9 +-
 onnxruntime/core/graph/model.cc               |    2 +-
 onnxruntime/core/graph/model.h                |    1 -
 onnxruntime/core/graph/model_load_utils.h     |    1 -
 onnxruntime/core/graph/node_attr_utils.h      |    2 +-
 .../runtime_optimization_record_container.cc  |    2 +-
 .../core/optimizer/embed_layer_norm_fusion.cc |    6 +-
 .../optimizer/free_dim_override_transformer.h |    2 +-
 onnxruntime/core/optimizer/initializer.cc     |    2 +-
 .../core/optimizer/propagate_cast_ops.cc      |   22 +-
 .../optimizer/qdq_transformer/qdq_util.cc     |    3 +-
 .../optimizer/selectors_actions/actions.cc    |    2 +-
 .../optimizer/selectors_actions/actions.h     |    3 +-
 .../optimizer/selectors_actions/helpers.cc    |   14 +-
 .../optimizer/selectors_actions/helpers.h     |    7 +-
 .../transpose_optimizer/optimizer_api_impl.cc |    4 +-
 .../transpose_optimizer.cc                    |    2 +-
 onnxruntime/core/platform/env.cc              |   13 -
 onnxruntime/core/platform/env.h               |    4 +-
 onnxruntime/core/platform/path_lib.cc         |    2 -
 onnxruntime/core/platform/posix/env.cc        |    8 +-
 onnxruntime/core/platform/windows/env.cc      |    7 +-
 .../core/platform/windows/stacktrace.cc       |    2 +-
 .../core/providers/cpu/controlflow/if.h       |    1 -
 .../core/providers/cpu/controlflow/loop.cc    |    6 +-
 .../core/providers/cpu/controlflow/loop.h     |    1 -
 .../core/providers/cpu/controlflow/scan.h     |    2 +-
 .../core/providers/cpu/controlflow/scan_8.cc  |    2 +-
 .../core/providers/cpu/controlflow/scan_9.cc  |    2 +-
 .../providers/cpu/controlflow/scan_utils.cc   |    4 +-
 .../cpu/generator/constant_of_shape_base.h    |    2 +-
 .../core/providers/cpu/generator/random.cc    |    2 -
 .../core/providers/cpu/generator/random.h     |    2 +-
 .../einsum_typed_compute_processor.cc         |    8 +-
 .../providers/cpu/math/element_wise_ops.cc    |   41 +-
 .../providers/cpu/math/element_wise_ops.h     |    7 +-
 onnxruntime/core/providers/cpu/math/gemm.cc   |    6 +-
 onnxruntime/core/providers/cpu/math/hardmax.h |    2 +-
 onnxruntime/core/providers/cpu/math/sign.cc   |    6 +-
 onnxruntime/core/providers/cpu/math/softmax.h |    2 +-
 .../core/providers/cpu/math/softmax_shared.cc |    2 +-
 onnxruntime/core/providers/cpu/ml/cast_map.cc |    2 +-
 .../core/providers/cpu/ml/category_mapper.cc  |    6 +-
 .../providers/cpu/ml/feature_vectorizer.cc    |   14 +-
 .../core/providers/cpu/ml/label_encoder.cc    |    6 +-
 .../core/providers/cpu/ml/linearclassifier.cc |    6 +-
 onnxruntime/core/providers/cpu/ml/ml_common.h |    6 +-
 .../core/providers/cpu/ml/normalizer.cc       |    1 -
 .../core/providers/cpu/ml/normalizer.h        |    2 -
 .../core/providers/cpu/ml/svmclassifier.cc    |    8 +-
 onnxruntime/core/providers/cpu/nn/Unpool.cc   |    5 +-
 .../core/providers/cpu/nn/batch_norm.h        |    3 +-
 onnxruntime/core/providers/cpu/nn/conv.cc     |    9 +-
 .../core/providers/cpu/nn/conv_attributes.h   |    2 +-
 onnxruntime/core/providers/cpu/nn/flatten.h   |    2 +-
 onnxruntime/core/providers/cpu/nn/lrn.h       |    2 +-
 onnxruntime/core/providers/cpu/nn/shrink.cc   |    4 +-
 .../core/providers/cpu/nn/tfidfvectorizer.cc  |    4 +-
 .../quantization/quantize_linear_matmul.cc    |    7 +-
 .../providers/cpu/reduction/reduction_ops.cc  |   13 +-
 .../core/providers/cpu/rnn/deep_cpu_gru.cc    |   54 +-
 .../core/providers/cpu/rnn/deep_cpu_gru.h     |    5 +-
 .../core/providers/cpu/rnn/lstm_base.cc       |   11 +-
 .../core/providers/cpu/rnn/lstm_base.h        |    3 +-
 .../core/providers/cpu/rnn/rnn_helpers.cc     |    4 +-
 .../core/providers/cpu/rnn/rnn_helpers.h      |   36 +-
 .../providers/cpu/rnn/uni_directional_lstm.cc |   24 +-
 .../providers/cpu/rnn/uni_directional_lstm.h  |    1 -
 .../providers/cpu/sequence/sequence_ops.cc    |   10 +-
 .../core/providers/cpu/tensor/cast_op.cc      |   11 +-
 .../core/providers/cpu/tensor/gather.cc       |   17 +-
 .../cpu/tensor/mean_variance_normalization.h  |    2 +-
 .../core/providers/cpu/tensor/padbase.h       |    2 +-
 .../core/providers/cpu/tensor/reshape.h       |    1 -
 .../providers/cpu/tensor/reverse_sequence.cc  |    2 +-
 .../core/providers/cpu/tensor/scatter.cc      |   25 +-
 .../core/providers/cpu/tensor/shape_op.h      |    2 +-
 .../core/providers/cpu/tensor/slice.cc        |   21 +-
 .../cpu/tensor/slice_compute_metadata.h       |    2 +-
 .../core/providers/cpu/tensor/slice_helper.h  |    4 +-
 .../core/providers/cpu/tensor/split.cc        |   11 +-
 onnxruntime/core/providers/cpu/tensor/tile.cc |    1 -
 .../core/providers/cpu/tensor/transpose.h     |    2 +-
 .../core/providers/cpu/tensor/unique.cc       |    2 +-
 .../core/providers/cpu/tensor/unsqueeze.cc    |    6 +-
 onnxruntime/core/providers/cpu/tensor/utils.h |    7 +-
 .../core/providers/cpu/tensor/where_op.cc     |   10 +-
 .../core/providers/cuda/controlflow/if.h      |    1 -
 .../core/providers/cuda/controlflow/scan.h    |    1 -
 onnxruntime/core/providers/cuda/cuda_common.h |    2 +-
 .../providers/cuda/cuda_provider_factory.cc   |    2 +-
 .../core/providers/cuda/cudnn_common.cc       |    2 +-
 .../core/providers/cuda/math/softmax.h        |    2 +-
 .../providers/cuda/multi_tensor/common.cuh    |    2 +-
 onnxruntime/core/providers/cuda/nn/conv.cc    |    7 +-
 onnxruntime/core/providers/cuda/nn/pool.cc    |    2 +-
 onnxruntime/core/providers/cuda/nn/shrink.h   |    1 -
 .../core/providers/cuda/rnn/cudnn_rnn_base.h  |    4 +-
 onnxruntime/core/providers/cuda/rnn/gru.h     |    2 +-
 onnxruntime/core/providers/cuda/rnn/rnn.h     |    1 -
 .../providers/cuda/shared_inc/cuda_utils.h    |    2 +-
 .../providers/cuda/tensor/quantize_linear.h   |    1 -
 .../core/providers/cuda/tensor/reshape.h      |    3 +-
 .../core/providers/cuda/tensor/transpose.cc   |    2 +-
 .../core/providers/cuda/tensor/transpose.h    |    2 +-
 .../src/ExecutionProvider.cpp                 |    6 +-
 .../DmlOperatorMeanVarianceNormalization.cpp  |    3 +-
 .../dml/DmlExecutionProvider/src/precomp.h    |    2 +-
 .../providers/dml/GraphTransformers/precomp.h |    2 +-
 .../OperatorAuthorHelper/OperatorHelper.cpp   |   12 +-
 .../dml/OperatorAuthorHelper/precomp.h        |    2 +-
 .../providers/dnnl/dnnl_execution_provider.cc |    7 +-
 .../dnnl/subgraph/dnnl_subgraph_primitive.h   |    7 +-
 .../builders/impl/slice_op_builder.cc         |    2 +-
 .../builders/op_builder_helpers.cc            |   13 +-
 .../core/providers/rocm/miopen_common.cc      |    2 +-
 onnxruntime/core/providers/rocm/nn/conv.cc    |   13 +-
 .../core/providers/rocm/nn/conv_transpose.cc  |    4 +-
 onnxruntime/core/providers/rocm/rocm_common.h |    2 +-
 .../providers/rocm/rocm_provider_factory.cc   |    2 +-
 .../providers/shared_library/provider_api.h   |    2 +-
 .../shared_library/provider_wrappedtypes.h    |    2 +-
 .../tensorrt/tensorrt_execution_provider.cc   |    2 +-
 .../core/providers/xnnpack/detail/utils.cc    |    2 +-
 onnxruntime/core/providers/xnnpack/nn/conv.cc |    2 +-
 .../core/session/abi_session_options.cc       |    3 -
 onnxruntime/core/session/inference_session.cc |    2 +-
 onnxruntime/core/session/onnxruntime_c_api.cc |   28 +-
 onnxruntime/core/util/math_cpu.cc             |   12 +-
 onnxruntime/gsl/gsl                           |   27 -
 onnxruntime/gsl/gsl-lite-vc6.hpp              |  697 ----
 onnxruntime/gsl/gsl-lite.h                    |   29 -
 onnxruntime/gsl/gsl-lite.hpp                  | 2836 -----------------
 .../python/onnxruntime_pybind_mlvalue.cc      |    4 +-
 onnxruntime/test/common/narrow_test.cc        |   76 +
 onnxruntime/test/common/span_utils_test.cc    |    8 +-
 .../test/common/tensor_op_test_utils.h        |    4 +-
 .../test/contrib_ops/beam_search_test.cc      |    6 +-
 .../dynamic_quantize_matmul_test.cc           |    5 +-
 .../contrib_ops/math/matmul_sparse_test.cc    |    4 +-
 .../matmul_integer_to_float_test.cc           |    7 +-
 .../contrib_ops/quantize_attention_op_test.cc |    3 +-
 onnxruntime/test/eager/ort_invoker_test.cc    |    9 +-
 .../test/framework/execution_frame_test.cc    |    6 +-
 .../test/framework/sparse_kernels_test.cc     |   73 +-
 .../test/framework/tensor_shape_test.cc       |   10 +-
 onnxruntime/test/framework/test_utils.h       |    2 +-
 onnxruntime/test/ir/graph_test.cc             |    9 +-
 onnxruntime/test/onnx/tensorprotoutils.cc     |    1 -
 .../optimizer/avx2_weight_s8_to_u8_test.cc    |   18 +-
 .../test/optimizer/initializer_test.cc        |    4 +-
 onnxruntime/test/platform/file_io_test.cc     |    9 +-
 .../providers/cpu/controlflow/scan_test.cc    |    2 +-
 .../cpu/ml/array_feature_extractor_test.cc    |    2 +-
 onnxruntime/test/providers/cpu/model_tests.cc |    1 -
 .../test/providers/cpu/tensor/cast_op_test.cc |    2 +-
 .../providers/cpu/tensor/where_op_test.cc     |    2 +-
 .../internal_testing_tests.cc                 |    3 +-
 .../test/providers/provider_test_utils.cc     |   16 +-
 .../test/providers/provider_test_utils.h      |    8 +-
 onnxruntime/test/shared_lib/test_inference.cc |    8 +-
 .../test/shared_lib/test_nontensor_types.cc   |   52 +-
 .../my_ep_factory.cc                          |    4 +-
 onnxruntime/test/util/test_utils.cc           |   28 +-
 .../orttraining/core/framework/pipeline.h     |    3 +-
 .../core/graph/optimizer_builder.h            |   18 +-
 .../graph/zero_optimizer_graph_builder.cc     |    9 +-
 .../core/optimizer/graph_transformer_utils.h  |    4 +-
 .../orttraining/core/session/tensor_helper.cc |    6 +-
 orttraining/orttraining/eager/ort_aten.cpp    |    3 +-
 .../graph/optimizer_graph_builder_test.cc     |    3 +-
 .../session/training_session_test_utils.cc    |    4 +-
 .../common/synthetic_data_loader.h            |    2 +-
 .../cpu/activation/activations_grad.cc        |    2 +-
 .../training_ops/cpu/loss/cross_entropy.cc    |    2 +-
 .../cpu/loss/softmax_cross_entropy_loss.cc    |    2 +-
 .../training_ops/cpu/op_gradients.cc          |    2 +-
 .../training_ops/cpu/tensor/split.cc          |   13 +-
 .../training_ops/cuda/nn/batch_norm_grad.h    |    2 -
 .../cuda/nn/batch_norm_internal.h             |    1 -
 .../training_ops/rocm/nn/batch_norm_grad.h    |    2 -
 .../rocm/nn/batch_norm_internal.h             |    1 -
 292 files changed, 1128 insertions(+), 4486 deletions(-)
 create mode 100644 cmake/external/gsl.cmake
 delete mode 100644 cmake/external/gsl.natvis
 create mode 100644 include/onnxruntime/core/common/gsl.h
 delete mode 100644 include/onnxruntime/core/common/gsl_suppress.h
 create mode 100644 include/onnxruntime/core/common/narrow.h
 delete mode 100644 onnxruntime/gsl/gsl
 delete mode 100644 onnxruntime/gsl/gsl-lite-vc6.hpp
 delete mode 100644 onnxruntime/gsl/gsl-lite.h
 delete mode 100644 onnxruntime/gsl/gsl-lite.hpp
 create mode 100644 onnxruntime/test/common/narrow_test.cc

diff --git a/cgmanifests/cgmanifest.json b/cgmanifests/cgmanifest.json
index d20ff130df..bbe1d28be0 100644
--- a/cgmanifests/cgmanifest.json
+++ b/cgmanifests/cgmanifest.json
@@ -28,8 +28,8 @@
          "component": {
             "type": "git",
             "git": {
-               "commitHash": "58123b93bd7f12d17ac0c46379a0f2c0255d9213",
-               "repositoryUrl": "https://github.com/martinmoene/gsl-lite.git"
+               "commitHash": "a3534567187d2edc428efd3f13466ff75fe5805c",
+               "repositoryUrl": "https://github.com/microsoft/gsl.git"
             }
          }
       },
diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json
index f7ddab924a..8cfff8b5bc 100644
--- a/cgmanifests/generated/cgmanifest.json
+++ b/cgmanifests/generated/cgmanifest.json
@@ -193,7 +193,7 @@
       "component": {
         "type": "git",
         "git": {
-          "commitHash": "53495a2a7d6ba7e0691a7f3602e9a5324bba6e45",
+          "commitHash": "58d77fa8070e8cec2dc1ed015d66b454c8d78850",
           "repositoryUrl": "https://github.com/google/googletest.git"
         },
         "comments": "git submodule at cmake/external/googletest"
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 421d4553cb..ed44a2c95b 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -1016,27 +1016,14 @@ if (CPUINFO_SUPPORTED)
   endif()
 endif()
 
-# bounds checking behavior.
-# throw instead of calling terminate if there's a bounds checking violation.
-# we make it through via a handler so CUDA does not complain
-# The following -DGSL macros are recognized by gsl-lite along with -Dgsl macros
-# no bounds checking in release build so no perf cost
-# if we enable onnxruntime_DISABLE_EXCEPTIONS, gsl will terminate
-if (onnxruntime_DISABLE_EXCEPTIONS)
-  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DGSL_TERMINATE_ON_CONTRACT_VIOLATION")
-else()
-  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DGSL_THROW_ON_CONTRACT_VIOLATION")
-endif()
-set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DGSL_UNENFORCED_ON_CONTRACT_VIOLATION")
-set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -DGSL_UNENFORCED_ON_CONTRACT_VIOLATION")
-set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} -DGSL_UNENFORCED_ON_CONTRACT_VIOLATION")
+include(gsl)
 
 include(eigen)
 
 #onnxruntime_EXTERNAL_LIBRARIES could contain onnx, onnx_proto,libprotobuf, cuda/cudnn,
 # dnnl/mklml, onnxruntime_codegen_tvm, tvm and pthread
 # pthread is always at the last
-set(onnxruntime_EXTERNAL_LIBRARIES onnx onnx_proto ${PROTOBUF_LIB} re2::re2)
+set(onnxruntime_EXTERNAL_LIBRARIES onnx onnx_proto ${PROTOBUF_LIB} re2::re2 ${GSL_TARGET})
 
 if(NOT onnxruntime_DISABLE_ABSEIL)
   set(ABSEIL_LIBS absl::inlined_vector absl::flat_hash_set
diff --git a/cmake/external/googletest b/cmake/external/googletest
index 53495a2a7d..58d77fa807 160000
--- a/cmake/external/googletest
+++ b/cmake/external/googletest
@@ -1 +1 @@
-Subproject commit 53495a2a7d6ba7e0691a7f3602e9a5324bba6e45
+Subproject commit 58d77fa8070e8cec2dc1ed015d66b454c8d78850
diff --git a/cmake/external/gsl.cmake b/cmake/external/gsl.cmake
new file mode 100644
index 0000000000..bbba51a80f
--- /dev/null
+++ b/cmake/external/gsl.cmake
@@ -0,0 +1,16 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+include(FetchContent)
+
+FetchContent_Declare(
+    GSL
+    GIT_REPOSITORY https://github.com/microsoft/gsl
+    GIT_TAG a3534567187d2edc428efd3f13466ff75fe5805c  # v4.0.0
+    # GIT_SHALLOW is intentionally not set: a shallow clone only supports branch/tag names for GIT_TAG, not a commit hash.
+    )
+
+FetchContent_MakeAvailable(GSL)
+
+set(GSL_TARGET "Microsoft.GSL::GSL")
+set(GSL_INCLUDE_DIR "$<TARGET_PROPERTY:${GSL_TARGET},INTERFACE_INCLUDE_DIRECTORIES>")
diff --git a/cmake/external/gsl.natvis b/cmake/external/gsl.natvis
deleted file mode 100644
index 5fbe9b9da1..0000000000
--- a/cmake/external/gsl.natvis
+++ /dev/null
@@ -1,14 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">
-  <Type Name="gsl::span&lt;*&gt;">
-    <Intrinsic Name="_size" Expression="(last_ - first_)"/>
-    <DisplayString>{{ size={ _size() }}}</DisplayString>
-    <Expand>
-      <Item Name="[size]" ExcludeView="simple">_size()</Item>
-      <IndexListItems Condition="_size() &gt; 0">
-        <Size>_size()</Size>
-        <ValueNode>first_[$i]</ValueNode>
-      </IndexListItems>
-    </Expand>
-  </Type>
-</AutoVisualizer>
diff --git a/cmake/onnxruntime_common.cmake b/cmake/onnxruntime_common.cmake
index a4426d6713..fe301d4bd6 100644
--- a/cmake/onnxruntime_common.cmake
+++ b/cmake/onnxruntime_common.cmake
@@ -78,7 +78,7 @@ file(GLOB onnxruntime_common_src CONFIGURE_DEPENDS
 # Remove new/delete intercept. To deal with memory leaks
 # Use either non-mimalloc build OR use mimalloc built-in features.
 if(WIN32 AND onnxruntime_USE_MIMALLOC)
-    list(REMOVE_ITEM onnxruntime_common_src 
+    list(REMOVE_ITEM onnxruntime_common_src
     "${ONNXRUNTIME_ROOT}/core/platform/windows/debug_alloc.cc"
     "${ONNXRUNTIME_ROOT}/core/platform/windows/debug_alloc.h")
 endif()
@@ -116,11 +116,6 @@ if(NOT onnxruntime_DISABLE_ABSEIL)
     target_sources(
         onnxruntime_common
         INTERFACE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/external/${ABSEIL_NATVIS_FILE}>)
-    set(GSL_NATVIS_FILE "gsl.natvis")
-    target_sources(
-        onnxruntime_common
-        INTERFACE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/external/${GSL_NATVIS_FILE}>
-    )
   endif()
 endif()
 
@@ -131,7 +126,7 @@ target_include_directories(onnxruntime_common
     PUBLIC
         ${OPTIONAL_LITE_INCLUDE_DIR})
 
-target_link_libraries(onnxruntime_common safeint_interface Boost::mp11)
+target_link_libraries(onnxruntime_common safeint_interface Boost::mp11 ${GSL_TARGET})
 
 if(NOT WIN32)
   target_include_directories(onnxruntime_common PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/external/nsync/public")
diff --git a/cmake/onnxruntime_flatbuffers.cmake b/cmake/onnxruntime_flatbuffers.cmake
index b9d97c962d..d87dc94fa1 100644
--- a/cmake/onnxruntime_flatbuffers.cmake
+++ b/cmake/onnxruntime_flatbuffers.cmake
@@ -9,7 +9,7 @@ file(GLOB onnxruntime_flatbuffers_srcs CONFIGURE_DEPENDS
 source_group(TREE ${REPO_ROOT} FILES ${onnxruntime_flatbuffers_srcs})
 
 onnxruntime_add_static_library(onnxruntime_flatbuffers ${onnxruntime_flatbuffers_srcs})
-onnxruntime_add_include_to_target(onnxruntime_flatbuffers onnx flatbuffers)
+onnxruntime_add_include_to_target(onnxruntime_flatbuffers onnx flatbuffers ${GSL_TARGET})
 if(onnxruntime_ENABLE_INSTRUMENT)
   target_compile_definitions(onnxruntime_flatbuffers PUBLIC ONNXRUNTIME_ENABLE_INSTRUMENT)
 endif()
@@ -41,4 +41,4 @@ namespace std { using ::getenv; }
 ]])
   target_compile_options(flatbuffers PRIVATE /FI${CMAKE_BINARY_DIR}/gdk_cstdlib_wrapper.h)
   target_compile_options(flatc PRIVATE /FI${CMAKE_BINARY_DIR}/gdk_cstdlib_wrapper.h)
-endif()
\ No newline at end of file
+endif()
diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
index e2c297c0cb..125d0a07f3 100644
--- a/cmake/onnxruntime_mlas.cmake
+++ b/cmake/onnxruntime_mlas.cmake
@@ -502,6 +502,7 @@ endif()
 
 foreach(mlas_target ${ONNXRUNTIME_MLAS_LIBS})
     target_include_directories(${mlas_target} PRIVATE ${ONNXRUNTIME_ROOT}/core/mlas/inc ${MLAS_SRC_DIR})
+    onnxruntime_add_include_to_target(${mlas_target} ${GSL_TARGET})
 endforeach()
 set_target_properties(onnxruntime_mlas PROPERTIES FOLDER "ONNXRuntime")
 if (WIN32)
diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake
index f9505d6b54..40f6208e70 100644
--- a/cmake/onnxruntime_providers.cmake
+++ b/cmake/onnxruntime_providers.cmake
@@ -550,7 +550,7 @@ if (onnxruntime_USE_DNNL)
   add_dependencies(onnxruntime_providers_dnnl onnxruntime_providers_shared project_dnnl ${onnxruntime_EXTERNAL_DEPENDENCIES})
   target_include_directories(onnxruntime_providers_dnnl PRIVATE ${ONNXRUNTIME_ROOT} ${eigen_INCLUDE_DIRS} ${DNNL_INCLUDE_DIR} ${DNNL_OCL_INCLUDE_DIR})
   # ${CMAKE_CURRENT_BINARY_DIR} is so that #include "onnxruntime_config.h" inside tensor_shape.h is found
-  target_link_libraries(onnxruntime_providers_dnnl PRIVATE dnnl ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 ${ABSEIL_LIBS})
+  target_link_libraries(onnxruntime_providers_dnnl PRIVATE dnnl ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 ${ABSEIL_LIBS} ${GSL_TARGET})
   install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/dnnl  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers)
   set_target_properties(onnxruntime_providers_dnnl PROPERTIES FOLDER "ONNXRuntime")
   set_target_properties(onnxruntime_providers_dnnl PROPERTIES LINKER_LANGUAGE CXX)
diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
index 3708e1291d..3f82ba9df8 100644
--- a/cmake/onnxruntime_unittests.cmake
+++ b/cmake/onnxruntime_unittests.cmake
@@ -1097,6 +1097,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
   if (onnxruntime_BUILD_SHARED_LIB)
     onnxruntime_add_static_library(onnxruntime_mocked_allocator ${TEST_SRC_DIR}/util/test_allocator.cc)
     target_include_directories(onnxruntime_mocked_allocator PUBLIC ${TEST_SRC_DIR}/util/include)
+    target_link_libraries(onnxruntime_mocked_allocator PRIVATE ${GSL_TARGET})
     set_target_properties(onnxruntime_mocked_allocator PROPERTIES FOLDER "ONNXRuntimeTest")
 
     #################################################################
@@ -1253,6 +1254,7 @@ else()
   onnxruntime_add_shared_library_module(custom_op_library ${TEST_SRC_DIR}/testdata/custom_op_library/custom_op_library.cc)
 endif()
 target_include_directories(custom_op_library PRIVATE ${REPO_ROOT}/include)
+target_link_libraries(custom_op_library PRIVATE ${GSL_TARGET})
 if(UNIX)
   if (APPLE)
     set(ONNXRUNTIME_CUSTOM_OP_LIB_LINK_FLAG "-Xlinker -dead_strip")
diff --git a/cmake/winml.cmake b/cmake/winml.cmake
index 056c3cbb3a..d74cbc0aab 100644
--- a/cmake/winml.cmake
+++ b/cmake/winml.cmake
@@ -187,6 +187,7 @@ target_include_directories(winml_lib_telemetry PRIVATE ${winml_lib_telemetry_dir
 target_include_directories(winml_lib_telemetry PRIVATE ${winml_lib_common_dir}/inc)
 target_include_directories(winml_lib_telemetry PRIVATE ${ONNXRUNTIME_INCLUDE_DIR}/core/platform/windows)
 target_include_directories(winml_lib_telemetry PRIVATE ${REPO_ROOT}/winml)
+target_include_directories(winml_lib_telemetry PRIVATE ${GSL_INCLUDE_DIR})
 
 # Properties
 set_target_properties(winml_lib_telemetry
@@ -264,6 +265,7 @@ target_include_directories(winml_lib_ort PRIVATE ${winml_lib_api_ort_dir})
 target_include_directories(winml_lib_ort PRIVATE ${winml_lib_common_dir}/inc)
 target_include_directories(winml_lib_ort PRIVATE ${ONNXRUNTIME_INCLUDE_DIR})
 target_include_directories(winml_lib_ort PRIVATE ${ONNXRUNTIME_ROOT})
+target_include_directories(winml_lib_ort PRIVATE ${GSL_INCLUDE_DIR})
 
 set_target_properties(winml_lib_ort
   PROPERTIES
@@ -403,13 +405,13 @@ target_include_directories(winml_lib_image PRIVATE ${winml_lib_api_image_dir})
 target_include_directories(winml_lib_image PRIVATE ${winml_lib_common_dir}/inc)
 target_include_directories(winml_lib_image PRIVATE ${ONNXRUNTIME_ROOT})
 target_include_directories(winml_lib_image PRIVATE ${ONNXRUNTIME_INCLUDE_DIR})                                                        # for status.h
-target_include_directories(winml_lib_image PRIVATE ${REPO_ROOT}/cmake/external/gsl/include)
 target_include_directories(winml_lib_image PRIVATE ${REPO_ROOT}/cmake/external/onnx)
 target_include_directories(winml_lib_image PRIVATE ${REPO_ROOT}/cmake/external/protobuf/src)
 target_include_directories(winml_lib_image PRIVATE ${ONNXRUNTIME_INCLUDE_DIR}/core/platform/windows)
 target_include_directories(winml_lib_image PRIVATE ${REPO_ROOT}/cmake/external/flatbuffers/include)
 target_include_directories(winml_lib_image PRIVATE ${REPO_ROOT}/cmake/external/mp11/include)
 target_include_directories(winml_lib_image PRIVATE ${REPO_ROOT}/winml)
+target_include_directories(winml_lib_image PRIVATE ${GSL_INCLUDE_DIR})
 
 # Properties
 set_target_properties(winml_lib_image
@@ -511,7 +513,6 @@ target_include_directories(winml_lib_api PRIVATE ${winml_lib_common_dir}/inc)
 
 target_include_directories(winml_lib_api PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
 target_include_directories(winml_lib_api PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/external/date/include)
-target_include_directories(winml_lib_api PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/external/gsl/include)
 target_include_directories(winml_lib_api PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/external/onnx)
 
 target_include_directories(winml_lib_api PRIVATE ${ONNXRUNTIME_INCLUDE_DIR})
@@ -521,11 +522,11 @@ target_include_directories(winml_lib_api PRIVATE ${ONNXRUNTIME_ROOT}/core/graph)
 target_include_directories(winml_lib_api PRIVATE ${REPO_ROOT}/cmake/external/eigen)
 target_include_directories(winml_lib_api PRIVATE ${REPO_ROOT}/cmake/external/onnx)
 target_include_directories(winml_lib_api PRIVATE ${REPO_ROOT}/cmake/external/protobuf/src)
-target_include_directories(winml_lib_api PRIVATE ${REPO_ROOT}/cmake/external/gsl/include)
 target_include_directories(winml_lib_api PRIVATE ${REPO_ROOT}/cmake/external/SafeInt)
 target_include_directories(winml_lib_api PRIVATE ${REPO_ROOT}/cmake/external/flatbuffers/include)
 target_include_directories(winml_lib_api PRIVATE ${REPO_ROOT}/cmake/external/mp11/include)
 target_include_directories(winml_lib_api PRIVATE ${REPO_ROOT}/winml)
+target_include_directories(winml_lib_api PRIVATE ${GSL_INCLUDE_DIR})
 
 # Properties
 set_target_properties(winml_lib_api
@@ -606,7 +607,6 @@ target_include_directories(winml_lib_api_experimental PRIVATE ${winml_lib_common
 
 target_include_directories(winml_lib_api_experimental PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
 target_include_directories(winml_lib_api_experimental PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/external/date/include)
-target_include_directories(winml_lib_api_experimental PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/external/gsl/include)
 target_include_directories(winml_lib_api_experimental PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/external/onnx)
 
 target_include_directories(winml_lib_api_experimental PRIVATE ${ONNXRUNTIME_INCLUDE_DIR})
@@ -616,11 +616,11 @@ target_include_directories(winml_lib_api_experimental PRIVATE ${ONNXRUNTIME_ROOT
 target_include_directories(winml_lib_api_experimental PRIVATE ${REPO_ROOT}/cmake/external/eigen)
 target_include_directories(winml_lib_api_experimental PRIVATE ${REPO_ROOT}/cmake/external/onnx)
 target_include_directories(winml_lib_api_experimental PRIVATE ${REPO_ROOT}/cmake/external/protobuf/src)
-target_include_directories(winml_lib_api_experimental PRIVATE ${REPO_ROOT}/cmake/external/gsl/include)
 target_include_directories(winml_lib_api_experimental PRIVATE ${REPO_ROOT}/cmake/external/SafeInt)
 target_include_directories(winml_lib_api_experimental PRIVATE ${REPO_ROOT}/cmake/external/flatbuffers/include)
 target_include_directories(winml_lib_api_experimental PRIVATE ${REPO_ROOT}/cmake/external/mp11/include)
 target_include_directories(winml_lib_api_experimental PRIVATE ${REPO_ROOT}/winml)
+target_include_directories(winml_lib_api_experimental PRIVATE ${GSL_INCLUDE_DIR})
 
 # Properties
 set_target_properties(winml_lib_api_experimental
@@ -692,8 +692,15 @@ target_include_directories(winml_lib_common PRIVATE ${winml_lib_api_dir})
 target_include_directories(winml_lib_common PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
 target_include_directories(winml_lib_common PRIVATE ${winml_lib_common_dir}/inc)
 target_include_directories(winml_lib_common PRIVATE ${REPO_ROOT}/winml)
+target_include_directories(winml_lib_common PRIVATE ${GSL_INCLUDE_DIR})
 target_precompiled_header(winml_lib_common lib/Common/inc/pch.h)
 
+# Properties
+set_target_properties(winml_lib_common
+  PROPERTIES
+  FOLDER
+  ${target_folder})
+
 if (onnxruntime_USE_DML)
   target_add_dml(winml_lib_common)
 endif()
@@ -762,7 +769,6 @@ target_include_directories(winml_dll PRIVATE ${winml_lib_common_dir}/inc)
 
 target_include_directories(winml_dll PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
 target_include_directories(winml_dll PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/external/date/include)
-target_include_directories(winml_dll PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/external/gsl/include)
 target_include_directories(winml_dll PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/external/onnx)
 
 target_include_directories(winml_dll PRIVATE ${ONNXRUNTIME_INCLUDE_DIR})
@@ -771,12 +777,12 @@ target_include_directories(winml_dll PRIVATE ${ONNXRUNTIME_ROOT})
 target_include_directories(winml_dll PRIVATE ${ONNXRUNTIME_ROOT}/core/graph)
 target_include_directories(winml_dll PRIVATE ${REPO_ROOT}/cmake/external/onnx)
 target_include_directories(winml_dll PRIVATE ${REPO_ROOT}/cmake/external/protobuf/src)
-target_include_directories(winml_dll PRIVATE ${REPO_ROOT}/cmake/external/gsl/include)
 target_include_directories(winml_dll PRIVATE ${REPO_ROOT}/cmake/external/eigen)
 target_include_directories(winml_dll PRIVATE ${REPO_ROOT}/cmake/external/SafeInt)
 target_include_directories(winml_dll PRIVATE ${REPO_ROOT}/cmake/external/flatbuffers/include)
 target_include_directories(winml_dll PRIVATE ${REPO_ROOT}/cmake/external/mp11/include)
 target_include_directories(winml_dll PRIVATE ${REPO_ROOT}/winml)
+target_include_directories(winml_dll PRIVATE ${GSL_INCLUDE_DIR})
 
 # Properties
 set_target_properties(winml_dll
diff --git a/cmake/winml_unittests.cmake b/cmake/winml_unittests.cmake
index 6e14591224..74f232ea15 100644
--- a/cmake/winml_unittests.cmake
+++ b/cmake/winml_unittests.cmake
@@ -183,7 +183,7 @@ add_dependencies(winml_test_common
   winml_api
   winml_dll
 )
-onnxruntime_add_include_to_target(winml_test_common onnx_proto)
+onnxruntime_add_include_to_target(winml_test_common onnx_proto ${GSL_TARGET})
 onnxruntime_add_static_library(winml_google_test_lib ${WINML_TEST_SRC_DIR}/common/googletest/main.cpp)
 set_winml_target_properties(winml_google_test_lib)
 
diff --git a/include/onnxruntime/core/common/common.h b/include/onnxruntime/core/common/common.h
index 7eed4737cd..44b467cc0a 100644
--- a/include/onnxruntime/core/common/common.h
+++ b/include/onnxruntime/core/common/common.h
@@ -36,7 +36,6 @@
 #include "core/common/exceptions.h"
 #include "core/common/make_string.h"
 #include "core/common/status.h"
-#include "core/common/gsl_suppress.h"
 
 
 namespace onnxruntime {
diff --git a/include/onnxruntime/core/common/gsl.h b/include/onnxruntime/core/common/gsl.h
new file mode 100644
index 0000000000..371c5b7543
--- /dev/null
+++ b/include/onnxruntime/core/common/gsl.h
@@ -0,0 +1,6 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "gsl/gsl"
diff --git a/include/onnxruntime/core/common/gsl_suppress.h b/include/onnxruntime/core/common/gsl_suppress.h
deleted file mode 100644
index 66702d0424..0000000000
--- a/include/onnxruntime/core/common/gsl_suppress.h
+++ /dev/null
@@ -1,15 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-#pragma once
-
-#ifndef GSL_SUPPRESS
-#if defined(__clang__) && !defined(__NVCC__)
-#define GSL_SUPPRESS(x) [[gsl::suppress("x")]]
-#else
-#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && !defined(__NVCC__)
-#define GSL_SUPPRESS(x) [[gsl::suppress(x)]]
-#else
-#define GSL_SUPPRESS(x)
-#endif  // _MSC_VER
-#endif  // __clang__
-#endif
\ No newline at end of file
diff --git a/include/onnxruntime/core/common/logging/capture.h b/include/onnxruntime/core/common/logging/capture.h
index 4f71bb3302..811744d2c0 100644
--- a/include/onnxruntime/core/common/logging/capture.h
+++ b/include/onnxruntime/core/common/logging/capture.h
@@ -4,7 +4,7 @@
 #pragma once
 
 #include <cstdarg>
-#include <gsl/gsl>
+#include "core/common/gsl.h"
 #include "core/common/common.h"
 #include "core/common/code_location.h"
 #include "core/common/logging/severity.h"
diff --git a/include/onnxruntime/core/common/narrow.h b/include/onnxruntime/core/common/narrow.h
new file mode 100644
index 0000000000..15bcf167db
--- /dev/null
+++ b/include/onnxruntime/core/common/narrow.h
@@ -0,0 +1,77 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+// onnxruntime::narrow() is like gsl::narrow() but it is also available when exceptions are disabled.
+
+#if !defined(ORT_NO_EXCEPTIONS)
+
+#include "gsl/narrow"
+
+namespace onnxruntime {
+using gsl::narrow;
+}  // namespace onnxruntime
+
+#else  // ^^ !defined(ORT_NO_EXCEPTIONS) ^^ / vv defined(ORT_NO_EXCEPTIONS) vv
+
+#include <cstdio>     // std::fprintf
+#include <exception>  // std::terminate
+#include <type_traits>
+
+#include "gsl/util"  // gsl::narrow_cast
+
+namespace onnxruntime {
+
+namespace detail {  // implementation helper for the exception-free narrow() overloads in this header
+[[noreturn]] inline void OnNarrowingError() noexcept {  // failure path of narrow(): never returns
+  std::fprintf(stderr, "%s", "narrowing error\n");  // report the failure on stderr before aborting
+  std::terminate();  // this branch is compiled only with ORT_NO_EXCEPTIONS, so terminate instead of throwing
+}
+}  // namespace detail
+
+// This implementation of onnxruntime::narrow was copied and adapted from:
+// https://github.com/microsoft/GSL/blob/a3534567187d2edc428efd3f13466ff75fe5805c/include/gsl/narrow
+
+// narrow() for arithmetic T: a checked narrow_cast() that calls detail::OnNarrowingError() (terminates) if the cast changed the value
+template <class T, class U, typename std::enable_if<std::is_arithmetic<T>::value>::type* = nullptr>
+// clang-format off
+GSL_SUPPRESS(type.1) // NO-FORMAT: attribute
+                      // clang-format on
+    constexpr T narrow(U u) noexcept {
+  constexpr const bool is_different_signedness =
+      (std::is_signed<T>::value != std::is_signed<U>::value);  // true when exactly one of T and U is signed
+
+  // clang-format off
+GSL_SUPPRESS(es.103) // NO-FORMAT: attribute // don't overflow
+GSL_SUPPRESS(es.104) // NO-FORMAT: attribute // don't underflow
+GSL_SUPPRESS(p.2) // NO-FORMAT: attribute // don't rely on undefined behavior
+  // clang-format on
+  const T t = gsl::narrow_cast<T>(u);  // While this is technically undefined behavior in some cases (i.e., if the source value is of floating-point type
+                                       // and cannot fit into the destination integral type), the resultant behavior is benign on the platforms
+                                       // that we target (i.e., no hardware trap representations are hit).
+
+  if (static_cast<U>(t) != u || (is_different_signedness && ((t < T{}) != (u < U{})))) {  // value did not round-trip, or its sign flipped across a signed/unsigned cast
+    detail::OnNarrowingError();
+  }
+
+  return t;  // the cast preserved the value
+}
+
+template <class T, class U, typename std::enable_if<!std::is_arithmetic<T>::value>::type* = nullptr>  // overload selected when T is not an arithmetic type
+// clang-format off
+GSL_SUPPRESS(type.1) // NO-FORMAT: attribute
+                      // clang-format on
+    constexpr T narrow(U u) noexcept {
+  const T t = gsl::narrow_cast<T>(u);  // perform the conversion first, then validate it
+
+  if (static_cast<U>(t) != u) {  // converting back to U must reproduce the input; no signedness check in this overload
+    detail::OnNarrowingError();
+  }
+
+  return t;  // the cast preserved the value
+}
+
+}  // namespace onnxruntime
+
+#endif  // defined(ORT_NO_EXCEPTIONS)
diff --git a/include/onnxruntime/core/common/span_utils.h b/include/onnxruntime/core/common/span_utils.h
index 8247cc1394..82aa59f0d0 100644
--- a/include/onnxruntime/core/common/span_utils.h
+++ b/include/onnxruntime/core/common/span_utils.h
@@ -3,23 +3,26 @@
 
 #pragma once
 
-#include <gsl/gsl>
+#include <algorithm>
+
+#include "core/common/gsl.h"
 
 namespace onnxruntime {
-// Inspired by Fekir's Blog https://fekir.info/post/span-the-missing-constructor/
+
+// AsSpan inspired by Fekir's Blog https://fekir.info/post/span-the-missing-constructor/
 // Used under MIT license
 
 // Use AsSpan for less typing on any container including initializer list to create a span
 // (unnamed, untyped initializer list does not automatically convert to gsl::span).
-// {1, 2, 3} as such does not have a type 
+// {1, 2, 3} as such does not have a type
 // (see https://scottmeyers.blogspot.com/2014/03/if-braced-initializers-have-no-type-why.html)
-// 
+//
 //   Example: AsSpan({1, 2, 3}) results in gsl::span<const int>
-// 
+//
 // The above would deduce to std::initializer_list<int> and the result is gsl::span<const int>
 //
 // AsSpan<int64_t>({1, 2, 3}) produces gsl::span<const int64_t>
-// 
+//
 // We can also do std::array<int64_t, 3>{1, 2, 3} that can be automatically converted to span
 // without memory allocation.
 //
@@ -38,7 +41,7 @@ template <class C>
 constexpr auto AsSpan(C& c) {
   return details::AsSpanImpl(c.data(), c.size());
 }
- 
+
 template <class C>
 constexpr auto AsSpan(const C& c) {
   return details::AsSpanImpl(c.data(), c.size());
@@ -64,7 +67,22 @@ constexpr auto AsSpan(const T (&arr)[N]) {
   return details::AsSpanImpl(arr, N);
 }
 
-template<class T>
+template <class T>
 inline gsl::span<const T> EmptySpan() { return gsl::span<const T>(); }
 
-}
\ No newline at end of file
+template <class U, class T>
+[[nodiscard]] inline gsl::span<U> ReinterpretAsSpan(gsl::span<T> src) {
+  // Returns a span of U over the same bytes that src refers to. Adapted from gsl-lite span::as_span():
+  // https://github.com/gsl-lite/gsl-lite/blob/4720a2980a30da085b4ddb4a0ea2a71af7351a48/include/gsl/gsl-lite.hpp#L4102-L4108
+  Expects(src.size_bytes() % sizeof(U) == 0);  // precondition: src's byte size is a whole number of U elements
+  return gsl::span<U>(reinterpret_cast<U*>(src.data()), src.size_bytes() / sizeof(U));  // NOTE(review): no alignment check here; presumably callers ensure src.data() is suitably aligned for U
+}
+
+template <class T1, size_t Extent1, class T2, size_t Extent2>  // extents may differ, so static and dynamic spans can be compared
+[[nodiscard]] inline bool SpanEq(gsl::span<T1, Extent1> a, gsl::span<T2, Extent2> b) {  // true when a and b hold equal elements
+  static_assert(std::is_same_v<std::remove_const_t<T1>, std::remove_const_t<T2>>,  // compile-time guard on the element types
+                "T1 and T2 should be the same type except for const qualification");
+  return std::equal(a.begin(), a.end(), b.begin(), b.end());  // four-iterator overload: spans of different length compare unequal
+}
+
+}  // namespace onnxruntime
diff --git a/include/onnxruntime/core/common/status.h b/include/onnxruntime/core/common/status.h
index bcf058886e..d6e1992944 100644
--- a/include/onnxruntime/core/common/status.h
+++ b/include/onnxruntime/core/common/status.h
@@ -19,7 +19,7 @@ limitations under the License.
 #ifdef _WIN32
 #include <winerror.h>
 #endif
-#include "core/common/gsl_suppress.h"
+#include "core/common/gsl.h"
 namespace onnxruntime {
 namespace common {
 
diff --git a/include/onnxruntime/core/framework/data_types.h b/include/onnxruntime/core/framework/data_types.h
index f4ca87eb9a..6dc38c2719 100644
--- a/include/onnxruntime/core/framework/data_types.h
+++ b/include/onnxruntime/core/framework/data_types.h
@@ -9,7 +9,7 @@
 #include <type_traits>
 #include <map>
 #include <unordered_map>
-#include "core/common/gsl_suppress.h"
+#include "core/common/gsl.h"
 #include "core/common/common.h"
 #include "core/common/exceptions.h"
 #include "core/framework/endian.h"
diff --git a/include/onnxruntime/core/framework/float16.h b/include/onnxruntime/core/framework/float16.h
index 04a5a9c97b..598f3748af 100644
--- a/include/onnxruntime/core/framework/float16.h
+++ b/include/onnxruntime/core/framework/float16.h
@@ -8,7 +8,7 @@
 #endif
 
 #if !defined(__CUDACC__) && !defined(__HIPCC__)
-#include <gsl/gsl>
+#include "core/common/narrow.h"
 #endif
 
 #include "core/common/common.h"
@@ -123,7 +123,7 @@ inline ORT_HOST_DEVICE bool operator<(const BFloat16& left, const BFloat16& righ
 // E.g 10_f16 or 10_b16
 #if !defined(__CUDACC__) && !defined(__HIPCC__)
 inline MLFloat16 operator"" _f16(unsigned long long int v) {
-  return MLFloat16(gsl::narrow<uint16_t>(v));
+  return MLFloat16(narrow<uint16_t>(v));  // checked conversion (core/common/narrow.h): fails if v does not fit in uint16_t
 }
 
 inline MLFloat16 operator"" _fp16(long double v) {
@@ -131,7 +131,7 @@ inline MLFloat16 operator"" _fp16(long double v) {
 }
 
 inline BFloat16 operator"" _b16(unsigned long long int v) {
-  return BFloat16(gsl::narrow<uint16_t>(v), BFloat16::FromBits());
+  return BFloat16(narrow<uint16_t>(v), BFloat16::FromBits());  // checked conversion (core/common/narrow.h): fails if v does not fit in uint16_t
 }
 
 inline BFloat16 operator"" _bfp16(long double v) {
diff --git a/include/onnxruntime/core/framework/op_kernel.h b/include/onnxruntime/core/framework/op_kernel.h
index 0f5425315b..0008b0de1d 100644
--- a/include/onnxruntime/core/framework/op_kernel.h
+++ b/include/onnxruntime/core/framework/op_kernel.h
@@ -30,7 +30,7 @@
 #endif
 #include "onnx/onnx_pb.h"
 #include "onnx/onnx-operators_pb.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 namespace onnxruntime {
 class OpKernelContext;
 }
diff --git a/include/onnxruntime/core/framework/op_kernel_info.h b/include/onnxruntime/core/framework/op_kernel_info.h
index dca4df8192..5e5487dc69 100644
--- a/include/onnxruntime/core/framework/op_kernel_info.h
+++ b/include/onnxruntime/core/framework/op_kernel_info.h
@@ -8,7 +8,7 @@
 #include "core/framework/ort_value.h"
 #include "core/framework/op_node_proto_helper.h"
 #include "core/graph/graph_viewer.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 namespace onnxruntime {
 
diff --git a/include/onnxruntime/core/framework/op_node_proto_helper.h b/include/onnxruntime/core/framework/op_node_proto_helper.h
index 41250bd42c..700e1edc0c 100644
--- a/include/onnxruntime/core/framework/op_node_proto_helper.h
+++ b/include/onnxruntime/core/framework/op_node_proto_helper.h
@@ -7,7 +7,7 @@
 #include "core/common/status.h"
 #include "core/framework/tensor_shape.h"
 #include "core/graph/graph_viewer.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #endif
 
 #ifdef __has_attribute
diff --git a/include/onnxruntime/core/framework/tensor.h b/include/onnxruntime/core/framework/tensor.h
index 4ca4777c89..5ec9b788ca 100644
--- a/include/onnxruntime/core/framework/tensor.h
+++ b/include/onnxruntime/core/framework/tensor.h
@@ -8,7 +8,7 @@
 #include <string>
 #include <vector>
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "core/common/common.h"
 #include "core/framework/allocator.h"
 #include "core/framework/tensor_shape.h"
@@ -189,7 +189,7 @@ class Tensor final {
     ORT_ENFORCE(utils::IsPrimitiveDataType<T>(dtype_), "Tensor type mismatch. ",
                 "T ", "!=", dtype_);
     const T* data = reinterpret_cast<const T*>(static_cast<char*>(p_data_) + byte_offset_);
-    return gsl::make_span(data, static_cast<typename gsl::span<T>::index_type>(shape_.Size()));
+    return gsl::make_span(data, static_cast<typename gsl::span<T>::size_type>(shape_.Size()));
   }
 
   void* MutableDataRaw(MLDataType type) {
diff --git a/include/onnxruntime/core/framework/tensor_shape.h b/include/onnxruntime/core/framework/tensor_shape.h
index 645d7d8079..b3783696b8 100644
--- a/include/onnxruntime/core/framework/tensor_shape.h
+++ b/include/onnxruntime/core/framework/tensor_shape.h
@@ -7,7 +7,7 @@
 #include <algorithm>
 #include <string>
 #include <cstring>
-#include <gsl/gsl>
+#include "core/common/gsl.h"
 #include "onnxruntime_config.h"
 
 #ifndef DISABLE_ABSEIL
@@ -29,6 +29,8 @@
 #endif
 #endif  // DISABLE_ABSEIL
 
+#include "core/common/span_utils.h"
+
 namespace onnxruntime {
 #ifdef __GNUC__
 #pragma GCC diagnostic push
@@ -96,7 +98,7 @@ class TensorShape {
   int64_t operator[](size_t idx) const { return values_[idx]; }
   int64_t& operator[](size_t idx) { return values_[idx]; }
 
-  bool operator==(const TensorShape& other) const noexcept { return GetDims() == other.GetDims(); }
+  bool operator==(const TensorShape& other) const noexcept { return SpanEq(GetDims(), other.GetDims()); }
   bool operator!=(const TensorShape& other) const noexcept { return !(*this == other); }
 
   size_t NumDimensions() const noexcept {
diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h
index 7dda32de41..ef5cb20913 100644
--- a/include/onnxruntime/core/graph/graph.h
+++ b/include/onnxruntime/core/graph/graph.h
@@ -29,12 +29,13 @@
 #pragma warning(pop)
 #endif
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 #include "core/common/common.h"
 #include "core/common/const_pointer_container.h"
 #include "core/common/inlined_containers_fwd.h"
 #include "core/common/path.h"
+#include "core/common/span_utils.h"
 #include "core/common/status.h"
 #include "core/common/logging/logging.h"
 #include "core/graph/basic_types.h"
@@ -935,8 +936,8 @@ class Graph {
                 const NodeAttributes* attributes = nullptr,
                 const std::string& domain = kOnnxDomain) {
     return AddNode(name, op_type, description,
-                   gsl::make_span(input_args.begin(), input_args.end()),
-                   gsl::make_span(output_args.begin(), output_args.end()),
+                   AsSpan(input_args),
+                   AsSpan(output_args),
                    attributes, domain);
   }
 
@@ -949,7 +950,7 @@ class Graph {
                 const std::string& domain = kOnnxDomain) {
     return AddNode(name, op_type, description,
                    input_args,
-                   gsl::make_span(output_args.begin(), output_args.end()),
+                   AsSpan(output_args),
                    attributes, domain);
   }
 
@@ -961,7 +962,7 @@ class Graph {
                 const NodeAttributes* attributes = nullptr,
                 const std::string& domain = kOnnxDomain) {
     return AddNode(name, op_type, description,
-                   gsl::make_span(input_args.begin(), input_args.end()),
+                   AsSpan(input_args),
                    output_args,
                    attributes, domain);
   }
@@ -1153,7 +1154,7 @@ class Graph {
   void SetInputs(gsl::span<const NodeArg* const> inputs);
 
   void SetInputs(std::initializer_list<const NodeArg*> inputs) {
-    SetInputs(gsl::make_span(inputs));
+    SetInputs(AsSpan(inputs));
   }
 
   const Model& GetModel() const {
@@ -1171,7 +1172,7 @@ class Graph {
   void SetOutputs(gsl::span<const NodeArg* const> outputs);
 
   void SetOutputs(std::initializer_list<const NodeArg*> outputs) {
-    SetOutputs(gsl::make_span(outputs.begin(), outputs.end()));
+    SetOutputs(AsSpan(outputs));
   }
 
 #endif  // !defined(ORT_MINIMAL_BUILD)
@@ -1232,7 +1233,7 @@ class Graph {
   }
 
   void UpdateConsumerNodes(const std::string& node_arg_name, std::initializer_list<Node*> nodes) {
-    UpdateConsumerNodes(node_arg_name, gsl::make_span(nodes));
+    UpdateConsumerNodes(node_arg_name, AsSpan(nodes));
   }
 
   /** During constant folding it may become possible to infer the shape for a node.
diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h
index ac81d00e7b..086aa1e4e8 100644
--- a/include/onnxruntime/core/session/onnxruntime_c_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -129,7 +129,6 @@ extern "C" {
 
 // Used in *.cc files. Almost as same as ORT_API_STATUS, except without ORT_MUST_USE_RESULT and ORT_EXPORT
 #define ORT_API_STATUS_IMPL(NAME, ...) \
-  GSL_SUPPRESS(r .11)                  \
   _Success_(return == 0) _Check_return_ _Ret_maybenull_ OrtStatusPtr ORT_API_CALL NAME(__VA_ARGS__) NO_EXCEPTION
 
 #define ORT_CLASS_RELEASE(X) void(ORT_API_CALL * Release##X)(_Frees_ptr_opt_ Ort##X * input)
diff --git a/onnxruntime/contrib_ops/cpu/attnlstm/attention_mechanism.h b/onnxruntime/contrib_ops/cpu/attnlstm/attention_mechanism.h
index 4e67f36db6..3c45959e0f 100644
--- a/onnxruntime/contrib_ops/cpu/attnlstm/attention_mechanism.h
+++ b/onnxruntime/contrib_ops/cpu/attnlstm/attention_mechanism.h
@@ -3,7 +3,7 @@
 
 #pragma once
 
-#include <gsl/gsl>
+#include "core/common/gsl.h"
 
 namespace onnxruntime {
 namespace contrib {
diff --git a/onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.cc b/onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.cc
index 54ffc62670..efa1db1379 100644
--- a/onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.cc
+++ b/onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.cc
@@ -51,7 +51,7 @@ void AttentionWrapper<T>::ProcessOutput(const gsl::span<const T>& rnn_cell_outpu
   // Get the context which is calculated within attention mechanism.
   attention_mechanism_.Compute(rnn_cell_output, prev_alignments_, attn_context_, alignments_);
   if (attention_mechanism_.NeedPrevAlignment()) {
-    std::copy(alignments_.cbegin(), alignments_.cend(), prev_alignments_.begin());
+    std::copy(alignments_.begin(), alignments_.end(), prev_alignments_.begin());
   }
 
   if (has_attn_layer_) {
diff --git a/onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.cc b/onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.cc
index 6398ff2852..49ed5ccf6a 100644
--- a/onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.cc
+++ b/onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.cc
@@ -63,11 +63,11 @@ template <typename T>
 void BahdanauAttention<T>::PrepareMemory(
     const gsl::span<const T>& memory,
     const gsl::span<const int>& memory_sequence_lengths) {
-  std::copy(memory.cbegin(), memory.cend(), values_.begin());
+  std::copy(memory.begin(), memory.end(), values_.begin());
   if (memory_sequence_lengths.empty()) {
     std::fill(mem_seq_lengths_.begin(), mem_seq_lengths_.end(), max_memory_steps_);
   } else {
-    std::copy(memory_sequence_lengths.cbegin(), memory_sequence_lengths.cend(), mem_seq_lengths_.begin());
+    std::copy(memory_sequence_lengths.begin(), memory_sequence_lengths.end(), mem_seq_lengths_.begin());
   }
 
   for (int b = 0; b < batch_size_; b++) {
@@ -145,7 +145,7 @@ void BahdanauAttention<T>::Compute(
       }
     }
 
-    SoftmaxInplace(gsl::span<T>{alignments, gsl::narrow_cast<gsl::index>(mem_steps)});
+    SoftmaxInplace(gsl::span<T>{alignments, gsl::narrow_cast<size_t>(mem_steps)});
 
     // Calculate the context
     auto outspan = output.subspan(b * memory_depth_);
diff --git a/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.cc b/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.cc
index 02a09e88db..96cae6d298 100644
--- a/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.cc
+++ b/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.cc
@@ -8,6 +8,7 @@
 
 #include "core/common/common.h"
 #include "core/common/logging/logging.h"
+#include "core/common/narrow.h"
 #include "core/platform/threadpool.h"
 #include "core/framework/allocator.h"
 //TODO: fix the warnings
@@ -95,9 +96,9 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
 
   auto& X_shape = X.Shape();
 
-  int seq_length = gsl::narrow<int>(X_shape[0]);
-  int batch_size = gsl::narrow<int>(X_shape[1]);
-  int input_size = gsl::narrow<int>(X_shape[2]);
+  int seq_length = narrow<int>(X_shape[0]);
+  int batch_size = narrow<int>(X_shape[1]);
+  int input_size = narrow<int>(X_shape[2]);
 
   // Processing attention wrapper
   constexpr int first_attn_input = 8;
@@ -113,12 +114,12 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
       am_query_layer_weights, am_memory_layer_weights, am_v_weights, attn_memory, attn_memory_seq_lens, attn_layer_weights);
   ORT_RETURN_IF_ERROR(status);
 
-  const int max_memory_step = gsl::narrow<int>(attn_memory.Shape()[1]);
-  const int memory_depth = gsl::narrow<int>(am_memory_layer_weights.Shape()[1]);
-  const int am_attn_size = gsl::narrow<int>(am_memory_layer_weights.Shape()[2]);
-  const int query_depth = gsl::narrow<int>(am_query_layer_weights.Shape()[1]);  // it is equal to hidden_size
+  const int max_memory_step = narrow<int>(attn_memory.Shape()[1]);
+  const int memory_depth = narrow<int>(am_memory_layer_weights.Shape()[1]);
+  const int am_attn_size = narrow<int>(am_memory_layer_weights.Shape()[2]);
+  const int query_depth = narrow<int>(am_query_layer_weights.Shape()[1]);  // it is equal to hidden_size
   const bool has_attention_layer = attn_layer_weights != nullptr;
-  const int attn_layer_depth = has_attention_layer ? gsl::narrow<int>(attn_layer_weights->Shape()[2]) : 0;
+  const int attn_layer_depth = has_attention_layer ? narrow<int>(attn_layer_weights->Shape()[2]) : 0;
   const int attention_size = has_attention_layer ? attn_layer_depth : memory_depth;
 
   const gsl::span<const T> attn_layer_weights_span = (has_attention_layer) ? attn_layer_weights->DataAsSpan<T>() : gsl::span<const T>();
@@ -202,7 +203,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
 
   if (!output.empty() && !sequence_lens_span.empty()) {
     // clear tailing outputs
-    int32_t max_seq_this_batch = *std::max_element(sequence_lens_span.cbegin(), sequence_lens_span.cend());
+    int32_t max_seq_this_batch = *std::max_element(sequence_lens_span.begin(), sequence_lens_span.end());
     if (max_seq_this_batch >= 0 && max_seq_this_batch < seq_length) {
       auto start = max_seq_this_batch * hidden_output_size_per_direction * num_directions_;
       std::fill(output.begin() + start, output.end(), T{});
@@ -424,8 +425,8 @@ static Status ValidateRnnInputsWithExtraInputFromState(
     }
 
     auto sequence_len_entries = sequence_lens->DataAsSpan<int>();
-    if (std::any_of(sequence_len_entries.cbegin(),
-                    sequence_len_entries.cend(),
+    if (std::any_of(sequence_len_entries.begin(),
+                    sequence_len_entries.end(),
                     [seq_length](int len) { return len <= 0 || len > seq_length; })) {
       return ORT_MAKE_STATUS(
           ONNXRUNTIME, INVALID_ARGUMENT,
@@ -461,8 +462,8 @@ Status DeepCpuAttnLstmOp::ValidateInputs(
                            "Attention mechanism memory shape error! Expected: {", batch_size,
                            "}, actural: ", memory_shape);
   }
-  const int max_memory_step = gsl::narrow<int>(memory_shape[1]);
-  const int memory_depth = gsl::narrow<int>(memory_shape[2]);
+  const int max_memory_step = narrow<int>(memory_shape[1]);
+  const int memory_depth = narrow<int>(memory_shape[2]);
   if (attn_memory_seq_lens != nullptr) {
     auto memory_seq_lens_shape = attn_memory_seq_lens->Shape();
     if (memory_seq_lens_shape.NumDimensions() != 1 || memory_seq_lens_shape[0] != batch_size) {
@@ -472,9 +473,9 @@ Status DeepCpuAttnLstmOp::ValidateInputs(
     }
     const gsl::span<const int> mem_seq_lens_span = attn_memory_seq_lens->DataAsSpan<int>();
     auto item_not_in_range = std::find_if(
-        mem_seq_lens_span.cbegin(), mem_seq_lens_span.cend(),
+        mem_seq_lens_span.begin(), mem_seq_lens_span.end(),
         [max_memory_step](int len) { return len <= 0 || len > max_memory_step; });
-    if (item_not_in_range != mem_seq_lens_span.cend()) {
+    if (item_not_in_range != mem_seq_lens_span.end()) {
       return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
                              "Attention mechanism memory sequence lengths value must in (0, ",
                              max_memory_step, "], while ", *item_not_in_range, " found!");
@@ -490,7 +491,7 @@ Status DeepCpuAttnLstmOp::ValidateInputs(
                            "Attention memory layer weight shape error! Expected:{",
                            num_directions_, ",", memory_depth, ", am_attn_size}, Got:", memory_layer_shape);
   }
-  const int am_attn_size = gsl::narrow<int>(memory_layer_shape[2]);
+  const int am_attn_size = narrow<int>(memory_layer_shape[2]);
 
   // check query layer weights of [num_directions, query_depth(hidden_size of lstm), am_attn_size]
   auto query_layer_shape = am_query_layer_weights.Shape();
@@ -525,7 +526,7 @@ Status DeepCpuAttnLstmOp::ValidateInputs(
                              "Attention layer weight shape error! Expected: {", num_directions_, ", ",
                              memory_depth + hidden_size_, ", aw_attn_size}. Got:", attn_layer_shape);
     }
-    aw_attn_size = gsl::narrow<int>(attn_layer_shape[2]);
+    aw_attn_size = narrow<int>(attn_layer_shape[2]);
   }
 
   auto status = ValidateRnnInputsWithExtraInputFromState(
diff --git a/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.h b/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.h
index c881a1efde..326b2d8dc4 100644
--- a/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.h
+++ b/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.h
@@ -7,6 +7,7 @@
 
 #include "attention_wrapper.h"
 
+#include "core/common/narrow.h"
 #include "core/framework/op_kernel.h"
 #include "core/providers/cpu/rnn/rnn_helpers.h"
 
@@ -30,7 +31,7 @@ class DeepCpuAttnLstmOp final : public OpKernel {
 
     int64_t int64_value;
     ORT_ENFORCE(info.GetAttr("hidden_size", &int64_value).IsOK() && int64_value > 0);
-    hidden_size_ = gsl::narrow<int>(int64_value);
+    hidden_size_ = narrow<int>(int64_value);
 
     // optional attributes
     std::vector<std::string> activation_func_names = info.GetAttrsOrDefault<std::string>("activations");
diff --git a/onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.cc b/onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.cc
index a408a8f95c..d8e1ab211c 100644
--- a/onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.cc
+++ b/onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.cc
@@ -162,7 +162,7 @@ void UniDirectionalAttnLstm<T>::LoadPeepholeWeights(const gsl::span<const T>& pe
   DumpMatrix("P[f]", peephole_weights.data() + (i++ * hidden_size_), 1, hidden_size_);
 
   auto copy_weight = [this, &peephole_weights](int offset, gsl::span<T>& out) {
-    typename gsl::span<const T>::const_iterator in_iter = peephole_weights.cbegin() + offset;
+    typename gsl::span<const T>::iterator in_iter = peephole_weights.begin() + offset;
     std::copy(in_iter, in_iter + hidden_size_, out.begin());
   };
 
@@ -245,9 +245,9 @@ void UniDirectionalAttnLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
   }
 
   // Calculate the max and min length
-  int32_t max_sequence_length = *std::max_element(sequence_lengths.cbegin(), sequence_lengths.cend());
-  int32_t min_sequence_length = std::min(seq_length_, *std::min_element(sequence_lengths.cbegin(),
-                                                                        sequence_lengths.cend()));
+  int32_t max_sequence_length = *std::max_element(sequence_lengths.begin(), sequence_lengths.end());
+  int32_t min_sequence_length = std::min(seq_length_, *std::min_element(sequence_lengths.begin(),
+                                                                        sequence_lengths.end()));
 
   ///**************************LSTM Calculations****************************/
   const int hidden_size_x4 = 4 * hidden_size_;
@@ -255,9 +255,9 @@ void UniDirectionalAttnLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
 
   // apply the weights to all the inputs and save to output_IOFC
   ComputeGemm(total_rows, hidden_size_x4, input_size_, T{1.0},
-              inputs.cbegin(), inputs.cend(),
+              inputs.begin(), inputs.end(),
               input_size_,
-              input_weights.cbegin(), input_weights.cend(),  // W[iofc]^T
+              input_weights.begin(), input_weights.end(),  // W[iofc]^T
               input_size_ + attention_size_, T{0.0},
               output_iofc_.begin(), output_iofc_.end(),
               hidden_size_x4, ttp_);
@@ -278,7 +278,7 @@ void UniDirectionalAttnLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
 
     // hidden state can be provided as input for first step, so need to special case that.
     // after the first step this will switch to the output from the previous step
-    span_T_const_iter previous_state = batched_hidden_state_one_step.cbegin();
+    span_T_const_iter previous_state = batched_hidden_state_one_step.begin();
 
     //run through steps sequentially
     for (int step = 0; step < max_sequence_length; step++) {
@@ -293,9 +293,9 @@ void UniDirectionalAttnLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
 
       // Xt*(W[iofc]^T) = INPUTt * W[iofc]^T + At-1 * WA[iofc]
       ComputeGemm(batch_size_, hidden_size_x4, attention_size_, T{1.0},
-                  attention.cbegin(), attention.cend(),  // At-1
+                  attention.begin(), attention.end(),  // At-1
                   attention_size_,
-                  input_weights.cbegin() + input_size_, input_weights.cend(),  // WA[iofc]
+                  input_weights.begin() + input_size_, input_weights.end(),  // WA[iofc]
                   input_size_ + attention_size_, T{1.0},
                   step_out_IOFC, output_iofc_.end(),  // input contains Xt*(W[iofc]^T)
                   hidden_size_x4, ttp_);
@@ -304,7 +304,7 @@ void UniDirectionalAttnLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
       ComputeGemm(batch_size_, hidden_size_x4, hidden_size_, T{1.0},
                   previous_state, previous_state_end,  // Ht-1
                   hidden_size_,
-                  recurrent_weights.cbegin(), recurrent_weights.cend(),  // R[iofc]
+                  recurrent_weights.begin(), recurrent_weights.end(),  // R[iofc]
                   hidden_size_, T{1.0},
                   step_out_IOFC, output_iofc_.end(),  // input contains Xt*(W[iofc]^T)
                   hidden_size_x4, ttp_);
diff --git a/onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.h b/onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.h
index 0b157e5c4b..aad0774891 100644
--- a/onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.h
+++ b/onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.h
@@ -11,7 +11,7 @@
 #include "core/common/logging/logging.h"
 #include "core/framework/allocator.h"
 
-#include <gsl/gsl>
+#include "core/common/gsl.h"
 
 namespace onnxruntime {
 namespace contrib {
@@ -69,7 +69,7 @@ class UniDirectionalAttnLstm {
   }
 
  private:
-  using span_T_const_iter = typename gsl::span<T>::const_iterator;
+  using span_T_const_iter = typename gsl::span<const T>::iterator;
   using span_T_iter = typename gsl::span<T>::iterator;
 
   void SetNumThreads();
diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_helper.h b/onnxruntime/contrib_ops/cpu/bert/attention_helper.h
index 20f3472cb8..6659825e00 100644
--- a/onnxruntime/contrib_ops/cpu/bert/attention_helper.h
+++ b/onnxruntime/contrib_ops/cpu/bert/attention_helper.h
@@ -101,7 +101,7 @@ void PrepareMask(const int32_t* mask_index,
   bool is_raw_attention_mask = (nullptr != mask_index && mask_index_dims.size() == 2);
   bool has_mask_start_position = (nullptr != mask_index &&
                                   mask_index_dims.size() == 1 &&
-                                  static_cast<int>(mask_index_dims.at(0)) == 2 * batch_size);
+                                  static_cast<int>(mask_index_dims[0]) == 2 * batch_size);
 
   for (int b_i = 0; b_i < batch_size; b_i++) {
     // TODO: mask_index can be used in softmax to save some calculation.
diff --git a/onnxruntime/contrib_ops/cpu/bert/bifurcation_detector.h b/onnxruntime/contrib_ops/cpu/bert/bifurcation_detector.h
index 6f939a4fea..367bc31afd 100644
--- a/onnxruntime/contrib_ops/cpu/bert/bifurcation_detector.h
+++ b/onnxruntime/contrib_ops/cpu/bert/bifurcation_detector.h
@@ -27,8 +27,8 @@ class BifurcationDetector : public OpKernel {
     const Tensor* pred_tokens = context->Input<Tensor>(3);
     const auto* src_tokens_data = static_cast<const int64_t*>(src_tokens->DataRaw());
     const auto* cur_tokens_data = static_cast<const int64_t*>(cur_tokens->DataRaw());
-    int64_t src_tokens_len = src_tokens->Shape().GetDims().at(0);
-    int64_t cur_tokens_len = cur_tokens->Shape().GetDims().at(0);
+    int64_t src_tokens_len = src_tokens->Shape().GetDims()[0];
+    int64_t cur_tokens_len = cur_tokens->Shape().GetDims()[0];
 
     Tensor* out_tokens = nullptr;
 
@@ -45,7 +45,7 @@ class BifurcationDetector : public OpKernel {
     } else {
       const auto* pred_tokens_data = static_cast<const int64_t*>(pred_tokens->DataRaw());
       const int64_t prev_suffix_match_idx_data = static_cast<const int64_t*>(prev_suffix_match_idx->DataRaw())[0];
-      int64_t pred_tokens_len = pred_tokens->Shape().GetDims().at(0);
+      int64_t pred_tokens_len = pred_tokens->Shape().GetDims()[0];
       // Find bifurcation index between prediction tokens, and source tokens
       // starting from previous suffix match index.
       ORT_ENFORCE(src_tokens_len >= prev_suffix_match_idx_data);
@@ -70,7 +70,7 @@ class BifurcationDetector : public OpKernel {
     // Return the index of the start of the n-gram in source tokens.
     // No matching if found if src tokens contain multiple or zero matching n-grams.
     // Return -1.
-    int64_t tokens_len = out_tokens->Shape().GetDims().at(0);
+    int64_t tokens_len = out_tokens->Shape().GetDims()[0];
     int64_t min_gram = min_ngram_size_;
     int64_t max_gram = max_ngram_size_;
     int64_t suffix_idx = -1;
diff --git a/onnxruntime/contrib_ops/cpu/bert/ngram_repeat_block.h b/onnxruntime/contrib_ops/cpu/bert/ngram_repeat_block.h
index 2041f0d0c1..aa4354fca3 100644
--- a/onnxruntime/contrib_ops/cpu/bert/ngram_repeat_block.h
+++ b/onnxruntime/contrib_ops/cpu/bert/ngram_repeat_block.h
@@ -3,8 +3,9 @@
 
 #pragma once
 
-#include <core/common/safeint.h>
 #include "core/common/common.h"
+#include "core/common/narrow.h"
+#include "core/common/safeint.h"
 #include "core/framework/op_kernel.h"
 #include "core/platform/threadpool.h"
 
@@ -36,7 +37,7 @@ class NGramRepeatBlock : public OpKernel {
     int64_t cur_len = input_ids_dims[1];
     ORT_ENFORCE(scores_dims[0] == batch_size);
     int64_t vocab_size = scores_dims[1];
-    
+
     if (cur_len + 1 < ngram_size_) {
       return Status::OK();
     }
@@ -69,7 +70,7 @@ class NGramRepeatBlock : public OpKernel {
 
     concurrency::ThreadPool* tp = context->GetOperatorThreadPool();
     concurrency::ThreadPool::TryParallelFor(
-        tp, gsl::narrow<std::ptrdiff_t>(batch_size) , static_cast<double>(cur_len * ngram_size_),
+        tp, narrow<std::ptrdiff_t>(batch_size), static_cast<double>(cur_len * ngram_size_),
         [&lambda](ptrdiff_t first, ptrdiff_t last) {
           for (auto b = static_cast<int64_t>(first), end = static_cast<int64_t>(last); b < end; ++b) {
             lambda(b);
diff --git a/onnxruntime/contrib_ops/cpu/crop.h b/onnxruntime/contrib_ops/cpu/crop.h
index 7892d1287b..77dce3dfaf 100644
--- a/onnxruntime/contrib_ops/cpu/crop.h
+++ b/onnxruntime/contrib_ops/cpu/crop.h
@@ -6,7 +6,7 @@
 #include "core/common/common.h"
 #include "core/framework/op_kernel.h"
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 namespace onnxruntime {
 namespace contrib {
diff --git a/onnxruntime/contrib_ops/cpu/image_scaler.h b/onnxruntime/contrib_ops/cpu/image_scaler.h
index 9fae72182f..21381e3fc2 100644
--- a/onnxruntime/contrib_ops/cpu/image_scaler.h
+++ b/onnxruntime/contrib_ops/cpu/image_scaler.h
@@ -3,8 +3,9 @@
 
 #pragma once
 
-#include <core/common/safeint.h>
 #include "core/common/common.h"
+#include "core/common/narrow.h"
+#include "core/common/safeint.h"
 #include "core/framework/op_kernel.h"
 #include "core/util/math_cpuonly.h"
 
@@ -44,7 +45,7 @@ class ImageScaler final : public OpKernel {
     EigenArrayMap<T> Y_arr(Y->MutableData<T>(), SafeInt<size_t>(H) * W, SafeInt<size_t>(N) * C);
 
     for (int64_t nc = 0; nc < N * C; ++nc) {
-      Y_arr.col(gsl::narrow<size_t>(nc)) = scale_ * X_arr.col(gsl::narrow<size_t>(nc)) + bias_[gsl::narrow<size_t>(nc % C)];
+      Y_arr.col(narrow<size_t>(nc)) = scale_ * X_arr.col(narrow<size_t>(nc)) + bias_[narrow<size_t>(nc % C)];
     }
     return Status::OK();
   }
diff --git a/onnxruntime/contrib_ops/cpu/inverse.cc b/onnxruntime/contrib_ops/cpu/inverse.cc
index 094eb8f207..355b036e36 100644
--- a/onnxruntime/contrib_ops/cpu/inverse.cc
+++ b/onnxruntime/contrib_ops/cpu/inverse.cc
@@ -2,6 +2,7 @@
 // Licensed under the MIT License.
 
 #include "core/common/common.h"
+#include "core/common/narrow.h"
 #include "core/framework/op_kernel.h"
 #include "core/platform/threadpool.h"
 #include "core/util/math_cpuonly.h"
@@ -41,8 +42,8 @@ struct Inverse::ComputeImpl {
     const auto* input_data = input->Data<T>() + batch_offset;
     auto* output_data = output->MutableData<T>() + batch_offset;
 
-    Eigen::Map<const MatrixT<T>> input_matrix(input_data, gsl::narrow<size_t>(rows), gsl::narrow<size_t>(cols));
-    Eigen::Map<MatrixT<T>> output_matrix(output_data, gsl::narrow<size_t>(rows), gsl::narrow<size_t>(cols));
+    Eigen::Map<const MatrixT<T>> input_matrix(input_data, narrow<size_t>(rows), narrow<size_t>(cols));
+    Eigen::Map<MatrixT<T>> output_matrix(output_data, narrow<size_t>(rows), narrow<size_t>(cols));
     output_matrix = input_matrix.inverse();
   }
 };
@@ -56,8 +57,8 @@ struct Inverse::ComputeImpl<MLFloat16> {
     const auto* input_data = reinterpret_cast<const Eigen::half*>(input->Data<MLFloat16>() + batch_offset);
     auto* output_data = reinterpret_cast<Eigen::half*>(output->MutableData<MLFloat16>() + batch_offset);
 
-    Eigen::Map<const MatrixT<Eigen::half>> input_matrix(input_data, gsl::narrow<size_t>(rows), gsl::narrow<size_t>(cols));
-    Eigen::Map<MatrixT<Eigen::half>> output_matrix(output_data, gsl::narrow<size_t>(rows), gsl::narrow<size_t>(cols));
+    Eigen::Map<const MatrixT<Eigen::half>> input_matrix(input_data, narrow<size_t>(rows), narrow<size_t>(cols));
+    Eigen::Map<MatrixT<Eigen::half>> output_matrix(output_data, narrow<size_t>(rows), narrow<size_t>(cols));
     output_matrix = input_matrix.inverse();
   }
 };
@@ -81,7 +82,7 @@ Status Inverse::Compute(OpKernelContext* ctx) const {
     t_disp.Invoke<ComputeImpl>(input, output, batch_num, rows, cols);
   };
 
-  concurrency::ThreadPool::TryBatchParallelFor(ctx->GetOperatorThreadPool(), gsl::narrow<size_t>(num_batches), std::move(fn), 0);
+  concurrency::ThreadPool::TryBatchParallelFor(ctx->GetOperatorThreadPool(), narrow<size_t>(num_batches), std::move(fn), 0);
 
   return Status::OK();
 }
diff --git a/onnxruntime/contrib_ops/cpu/math/sparse_dense_matmul.cc b/onnxruntime/contrib_ops/cpu/math/sparse_dense_matmul.cc
index c80f0491fe..14d5ca8463 100644
--- a/onnxruntime/contrib_ops/cpu/math/sparse_dense_matmul.cc
+++ b/onnxruntime/contrib_ops/cpu/math/sparse_dense_matmul.cc
@@ -4,6 +4,7 @@
 #if !defined(DISABLE_SPARSE_TENSORS)
 
 #include "core/framework/sparse_tensor.h"
+#include "core/common/narrow.h"
 #include "core/providers/cpu/math/gemm_matmul_common.h"
 #include "core/providers/cpu/math/matmul_helper.h"
 #include "core/util/math.h"
@@ -120,9 +121,9 @@ struct SparseToDenseCoo {
     auto coo_view = A.AsCoo();
     const auto& ind_dims = coo_view.Indices().Shape().GetDims();
     ORT_RETURN_IF_NOT(ind_dims.size() == 2, "COO indices must be 2-D, got: ", ind_dims.size());
-    ConstEigenMatrixMapRowMajor<int64_t> a_indicies_map(coo_view.Indices().Data<int64_t>(), gsl::narrow<size_t>(ind_dims[0]), gsl::narrow<size_t>(ind_dims[1]));
-    ConstEigenMatrixMapRowMajor<T> map_b(B.Data<T>(), gsl::narrow<size_t>(b_dims[0]), gsl::narrow<size_t>(b_dims[1]));
-    EigenMatrixMapRowMajor<T> output_map(output.MutableData<T>(), gsl::narrow<size_t>(out_dims[0]), gsl::narrow<size_t>(out_dims[1]));
+    ConstEigenMatrixMapRowMajor<int64_t> a_indicies_map(coo_view.Indices().Data<int64_t>(), narrow<size_t>(ind_dims[0]), narrow<size_t>(ind_dims[1]));
+    ConstEigenMatrixMapRowMajor<T> map_b(B.Data<T>(), narrow<size_t>(b_dims[0]), narrow<size_t>(b_dims[1]));
+    EigenMatrixMapRowMajor<T> output_map(output.MutableData<T>(), narrow<size_t>(out_dims[0]), narrow<size_t>(out_dims[1]));
     output_map.setZero();
 
     const auto rhs_right = (ctx.trans_B) ? b_dims[0] : b_dims[1];
@@ -139,8 +140,8 @@ struct SparseToDenseCoo {
       ORT_RETURN_IF_NOT(m < out_left, "COO m index: ", m, " is out of bounds of out_left: ", out_left);
       const T a_value = a_values[i];
       for (int64_t n = 0; n < rhs_right; ++n) {
-        const T b_value = (ctx.trans_B) ? map_b(gsl::narrow<size_t>(n), gsl::narrow<size_t>(k)) : map_b(gsl::narrow<size_t>(k), gsl::narrow<size_t>(n));
-        output_map(gsl::narrow<size_t>(m), gsl::narrow<size_t>(n)) += Mul(a_value, ctx.alpha, b_value);
+        const T b_value = (ctx.trans_B) ? map_b(narrow<size_t>(n), narrow<size_t>(k)) : map_b(narrow<size_t>(k), narrow<size_t>(n));
+        output_map(narrow<size_t>(m), narrow<size_t>(n)) += Mul(a_value, ctx.alpha, b_value);
       }
     }
 
diff --git a/onnxruntime/contrib_ops/cpu/maxpool_with_mask.h b/onnxruntime/contrib_ops/cpu/maxpool_with_mask.h
index 1503234990..7210a9a7c6 100644
--- a/onnxruntime/contrib_ops/cpu/maxpool_with_mask.h
+++ b/onnxruntime/contrib_ops/cpu/maxpool_with_mask.h
@@ -7,6 +7,7 @@
 
 #pragma once
 #include "core/common/common.h"
+#include "core/common/narrow.h"
 #include "core/framework/op_kernel.h"
 #include "core/framework/tensor.h"
 #include "core/providers/cpu/nn/pool_base.h"
@@ -229,7 +230,7 @@ class MaxpoolWithMask : public OpKernel, public PoolBase {
         int64_t y_step = pooled_height;
         const int64_t total_channels = x_shape[0] * channels;
         const int64_t total_mask_channels = m_shape[0] * m_shape[1];
-        RunMaxpoolLoop<MaxpoolWithMask1DTask<float>>(tp, gsl::narrow<size_t>(total_channels),
+        RunMaxpoolLoop<MaxpoolWithMask1DTask<float>>(tp, narrow<size_t>(total_channels),
                                                      {X_data, M_data, Y_data, x_step, y_step, pooled_height, stride_h(),
                                                       height, total_mask_channels, kernel_shape, pads});
         break;
@@ -241,7 +242,7 @@ class MaxpoolWithMask : public OpKernel, public PoolBase {
         const int64_t total_channels = x_shape[0] * channels;
         const int64_t total_mask_channels = m_shape[0] * m_shape[1];
         RunMaxpoolLoop<MaxpoolWithMask2DTask<float>>(
-            tp, gsl::narrow<size_t>(total_channels),
+            tp, narrow<size_t>(total_channels),
             {X_data, M_data, Y_data, x_step, y_step, pooled_height, pooled_width, stride_h(), stride_w(), height, width,
              total_mask_channels, kernel_shape, pads});
         break;
@@ -252,7 +253,7 @@ class MaxpoolWithMask : public OpKernel, public PoolBase {
         const int64_t total_channels = x_shape[0] * channels;
         const int64_t total_mask_channels = m_shape[0] * m_shape[1];
         RunMaxpoolLoop<MaxpoolWithMask3DTask<float>>(
-            tp, gsl::narrow<size_t>(total_channels),
+            tp, narrow<size_t>(total_channels),
             {X_data, M_data, Y_data, x_step, y_step, pooled_height, pooled_width, pooled_depth, stride_h(), stride_w(),
              stride_d(), height, width, depth, total_mask_channels, kernel_shape, pads});
         break;
diff --git a/onnxruntime/contrib_ops/cpu/nchwc_ops.cc b/onnxruntime/contrib_ops/cpu/nchwc_ops.cc
index 626a91d3c5..7c4ee548cc 100644
--- a/onnxruntime/contrib_ops/cpu/nchwc_ops.cc
+++ b/onnxruntime/contrib_ops/cpu/nchwc_ops.cc
@@ -1,8 +1,9 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <core/common/safeint.h>
 #include "nchwc_ops.h"
+#include "core/common/narrow.h"
+#include "core/common/safeint.h"
 #include "core/mlas/inc/mlas.h"
 
 namespace onnxruntime {
@@ -54,7 +55,7 @@ Status ReorderInput::Compute(OpKernelContext* context) const {
     // elements, so that operations involving a smaller number of channels will
     // process more rows per worker.
     constexpr ptrdiff_t worker_goal = 48 * 1024;
-    ptrdiff_t work_per_worker = std::max<ptrdiff_t>(worker_goal /  gsl::narrow<ptrdiff_t>(nchwc_channels), 1);
+    ptrdiff_t work_per_worker = std::max<ptrdiff_t>(worker_goal / narrow<ptrdiff_t>(nchwc_channels), 1);
     worker_count = std::max<ptrdiff_t>(total_work / work_per_worker, 1);
   } else {
     // Each iteration produces one spatial_size chunk of NCHWc blocks.
@@ -258,27 +259,27 @@ std::vector<float> NchwcUpsample::ComputeInterpolation(int64_t input_length,
                                                        int64_t output_length,
                                                        int64_t scale) const {
   std::vector<float> interpolation;
-  interpolation.resize(gsl::narrow<size_t>(output_length));
+  interpolation.resize(narrow<size_t>(output_length));
 
   if (scale == 1) {
     // Identity map for unscaled.
     for (int64_t o = 0; o < output_length; o++) {
-      interpolation[gsl::narrow<size_t>(o)] = static_cast<float>(o);
+      interpolation[narrow<size_t>(o)] = static_cast<float>(o);
     }
   } else if (transformation_mode_ == TransformationMode::ALIGN_CORNERS) {
     for (int64_t o = 0; o < output_length; o++) {
-      interpolation[gsl::narrow<size_t>(o)] =
+      interpolation[narrow<size_t>(o)] =
           static_cast<float>(o) * static_cast<float>(input_length - 1) / static_cast<float>(output_length - 1);
     }
   } else if (transformation_mode_ == TransformationMode::HALF_PIXEL) {
     for (int64_t o = 0; o < output_length; o++) {
-      interpolation[gsl::narrow<size_t>(o)] =
+      interpolation[narrow<size_t>(o)] =
           std::max(0.0f, (static_cast<float>(o) + 0.5f) / static_cast<float>(scale) - 0.5f);
     }
   } else {
     // Default to TransformationMode::ASYMMETRIC.
     for (int64_t o = 0; o < output_length; o++) {
-      interpolation[gsl::narrow<size_t>(o)] = static_cast<float>(o) / static_cast<float>(scale);
+      interpolation[narrow<size_t>(o)] = static_cast<float>(o) / static_cast<float>(scale);
     }
   }
 
@@ -353,7 +354,7 @@ Status NchwcUpsample::Compute(OpKernelContext* context) const {
               static_cast<size_t>(input_h),
               static_cast<size_t>(input_w),
               static_cast<size_t>(output_w),
-              interpolation_h[gsl::narrow<size_t>(row_index)],
+              interpolation_h[narrow<size_t>(row_index)],
               interpolation_w.data(),
               x_channel_base,
               y_row);
diff --git a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_lstm.cc b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_lstm.cc
index 7d118c0b22..43a80ba097 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_lstm.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_lstm.cc
@@ -1,3 +1,4 @@
+#include "core/common/narrow.h"
 #include "core/providers/cpu/rnn/lstm_base.h"
 #include "core/providers/cpu/rnn/rnn_helpers.h"
 #include "core/providers/cpu/rnn/uni_directional_lstm.h"
@@ -188,8 +189,8 @@ Status DynamicQuantizeLSTM::Compute(OpKernelContext* context) const {
   ZeroPointCheck(w_zp, W_zp_shape, is_W_signed, Input);
   ZeroPointCheck(r_zp, R_zp_shape, is_R_signed, Recurrent);
 
-  size_t W_scale_size = W_scale_shape.NumDimensions() == 2 ? gsl::narrow<size_t>(W_scale_shape[1]) : 1;
-  size_t R_scale_size = R_scale_shape.NumDimensions() == 2 ? gsl::narrow<size_t>(R_scale_shape[1]) : 1;
+  size_t W_scale_size = W_scale_shape.NumDimensions() == 2 ? narrow<size_t>(W_scale_shape[1]) : 1;
+  size_t R_scale_size = R_scale_shape.NumDimensions() == 2 ? narrow<size_t>(R_scale_shape[1]) : 1;
 
   QuantizationParameter quant_para_W_1(w_scale->Data<float>(),
                                        static_cast<const uint8_t*>(w_zp->DataRaw()),
diff --git a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc
index 24dd79d8c8..11d3e40a84 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc
@@ -1,6 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#include "core/common/narrow.h"
 #include "core/common/safeint.h"
 #include "core/mlas/inc/mlas.h"
 #include "core/providers/cpu/math/element_wise_ops.h"
@@ -102,7 +103,7 @@ Status MatMulIntegerToFloatBase::ComputeCommon(OpKernelContext* ctx,
     const float* b_scale_tensor_data = b_scale_tensor->Data<float>();
 
     if (is_b_scale_per_column) {
-      multipliers_per_column.reserve(gsl::narrow<size_t>(b_scale_tensor->Shape().Size()));
+      multipliers_per_column.reserve(narrow<size_t>(b_scale_tensor->Shape().Size()));
       std::transform(b_scale_tensor_data,
                      b_scale_tensor_data + b_scale_tensor->Shape().Size(),
                      std::back_inserter(multipliers_per_column),
@@ -217,7 +218,7 @@ Status DynamicQuantizeMatMul::Compute(OpKernelContext* ctx) const {
   uint8_t* a_data_quant = static_cast<uint8_t*>(allocator->Alloc(SafeInt<size_t>(num_of_elements) * sizeof(uint8_t)));
   BufferUniquePtr a_buffer_quant_holder(a_data_quant, BufferDeleter(std::move(allocator)));
 
-  ParQuantizeLinear(a_data, a_data_quant, gsl::narrow<size_t>(num_of_elements), a_scale, a_zero_point, ctx->GetOperatorThreadPool());
+  ParQuantizeLinear(a_data, a_data_quant, narrow<size_t>(num_of_elements), a_scale, a_zero_point, ctx->GetOperatorThreadPool());
 
   bool is_b_scale_supported = IsBQuantParamSupported(b_scale_tensor->Shape(), b ? b->Shape() : b_shape_);
   ORT_RETURN_IF_ERROR(ComputeCommon(
diff --git a/onnxruntime/contrib_ops/cpu/quantization/qlinear_activations.cc b/onnxruntime/contrib_ops/cpu/quantization/qlinear_activations.cc
index 9706fcd881..a5ffeaa0d1 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/qlinear_activations.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/qlinear_activations.cc
@@ -4,6 +4,7 @@
 #include "qlinear_activations.h"
 #include "qlinear_lookup_table.h"
 
+#include "core/common/narrow.h"
 #include "core/mlas/inc/mlas.h"
 #include "core/platform/threadpool.h"
 
@@ -53,7 +54,7 @@ Status QLinearLookupBase<T>::ComputeBase(OpKernelContext* context, Transformer f
   const uint8_t* x_data = reinterpret_cast<const uint8_t*>(X.Data<T>());
   uint8_t* y_data = reinterpret_cast<uint8_t*>(Y.MutableData<T>());
   ThreadPool::TryParallelFor(
-      tp, gsl::narrow<std::ptrdiff_t>(N), TensorOpCost{1.0, 1.0, 1.0},
+      tp, narrow<std::ptrdiff_t>(N), TensorOpCost{1.0, 1.0, 1.0},
       [this, x_data, y_data, &table](std::ptrdiff_t first, std::ptrdiff_t last) {
         QLinearLookupTableTransform(
             x_data + first,
diff --git a/onnxruntime/contrib_ops/cpu/quantization/qlinear_concat.cc b/onnxruntime/contrib_ops/cpu/quantization/qlinear_concat.cc
index 8e639ea994..0e868ae62a 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/qlinear_concat.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/qlinear_concat.cc
@@ -4,6 +4,7 @@
 #include "qlinear_concat.h"
 #include "qlinear_lookup_table.h"
 
+#include "core/common/narrow.h"
 #include "core/providers/common.h"
 #include "core/mlas/inc/mlas.h"
 #include "core/platform/threadpool.h"
@@ -158,9 +159,9 @@ Status QLinearConcat::Compute(OpKernelContext* ctx) const {
     uint8_t* output = static_cast<uint8_t*>(p.output_tensor->MutableDataRaw()) + initial_output_offset;
     for (int64_t cur_in_offset = 0; cur_in_offset < prep.num_elements; cur_in_offset += input_axis_pitch) {
       if (is_copy) {
-        memcpy(output, input + cur_in_offset, gsl::narrow<size_t>(input_axis_pitch));
+        memcpy(output, input + cur_in_offset, narrow<size_t>(input_axis_pitch));
       } else {
-        QLinearLookupTableTransform(input + cur_in_offset, table, output, gsl::narrow<size_t>(input_axis_pitch));
+        QLinearLookupTableTransform(input + cur_in_offset, table, output, narrow<size_t>(input_axis_pitch));
       }
       output += p.output_axis_pitch;
     }
diff --git a/onnxruntime/contrib_ops/cpu/quantization/qlinear_global_average_pool.cc b/onnxruntime/contrib_ops/cpu/quantization/qlinear_global_average_pool.cc
index a7eaa4c804..7eab698693 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/qlinear_global_average_pool.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/qlinear_global_average_pool.cc
@@ -2,6 +2,7 @@
 // Licensed under the MIT License.
 
 #include "qlinear_global_average_pool.h"
+#include "core/common/narrow.h"
 #include "core/util/math_cpuonly.h"
 #include "core/providers/common.h"
 #include "core/platform/threadpool.h"
@@ -32,7 +33,7 @@ Status ComputeQLinearGlobalAvgPool(
       const T8Bits* input = (const T8Bits*)(x + (first * image_size));
       T8Bits* output = (T8Bits*)(y + first);
       std::vector<int32_t> acc_buffer(MlasQLinearSafePaddingElementCount(sizeof(int32_t), last - first));
-      MlasQLinearGlobalAveragePoolNchw(input, x_scale, x_zero_point, output, y_scale, y_zero_point, last - first, gsl::narrow<size_t>(image_size), acc_buffer.data());
+      MlasQLinearGlobalAveragePoolNchw(input, x_scale, x_zero_point, output, y_scale, y_zero_point, last - first, narrow<size_t>(image_size), acc_buffer.data());
     };
     concurrency::ThreadPool::TryParallelFor(
         tp, static_cast<std::ptrdiff_t>(N * C), {1.0 * image_size, 1.0, 8.0 * image_size}, worker);
@@ -40,11 +41,11 @@ Status ComputeQLinearGlobalAvgPool(
     auto worker = [=](std::ptrdiff_t first, std::ptrdiff_t last) {
       const T8Bits* input = x + first * C * image_size;
       T8Bits* output = y + first * C;
-      std::vector<int32_t> acc_buffer(MlasQLinearSafePaddingElementCount(sizeof(int32_t), gsl::narrow<size_t>(C)));
-      std::vector<T8Bits> zero_buffer(MlasQLinearSafePaddingElementCount(sizeof(T8Bits), gsl::narrow<size_t>(C)), 0);
+      std::vector<int32_t> acc_buffer(MlasQLinearSafePaddingElementCount(sizeof(int32_t), narrow<size_t>(C)));
+      std::vector<T8Bits> zero_buffer(MlasQLinearSafePaddingElementCount(sizeof(T8Bits), narrow<size_t>(C)), 0);
       MlasQLinearGlobalAveragePoolNhwc(
           input, x_scale, x_zero_point, output, y_scale, y_zero_point,
-          last - first, gsl::narrow<size_t>(image_size), gsl::narrow<size_t>(C), gsl::narrow<size_t>(C), acc_buffer.data(), zero_buffer.data());
+          last - first, narrow<size_t>(image_size), narrow<size_t>(C), narrow<size_t>(C), acc_buffer.data(), zero_buffer.data());
     };
     concurrency::ThreadPool::TryParallelFor(
         tp, static_cast<std::ptrdiff_t>(N),
@@ -79,11 +80,11 @@ Status QLinearGlobalAveragePool::Compute(OpKernelContext* context) const {
 
   int64_t N = x_shape[0];
   int64_t C = (channels_last_ ? x_shape.back() : x_shape[1]);
-  int64_t image_size = std::accumulate(x_shape.cbegin() + spatial_dim_start, x_shape.cbegin() + spatial_dim_end,
+  int64_t image_size = std::accumulate(x_shape.begin() + spatial_dim_start, x_shape.begin() + spatial_dim_end,
                                        1LL, std::multiplies<int64_t>());
 
   std::vector<int64_t> output_dims(x_shape.begin(), x_shape.end());
-  std::transform(x_shape.cbegin() + spatial_dim_start, x_shape.cbegin() + spatial_dim_end,
+  std::transform(x_shape.begin() + spatial_dim_start, x_shape.begin() + spatial_dim_end,
                  output_dims.begin() + spatial_dim_start, [](const int64_t&) { return int64_t{1}; });
   Tensor& Y = *context->Output(0, output_dims);
 
diff --git a/onnxruntime/contrib_ops/cpu/quantization/qlinear_softmax.cc b/onnxruntime/contrib_ops/cpu/quantization/qlinear_softmax.cc
index 281f6d0550..7524f3ecce 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/qlinear_softmax.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/qlinear_softmax.cc
@@ -15,7 +15,7 @@
 
 #include "core/mlas/inc/mlas.h"
 #include "core/platform/threadpool.h"
-#include "gsl/gsl-lite.hpp"
+#include "core/common/gsl.h"
 
 namespace onnxruntime {
 namespace contrib {
diff --git a/onnxruntime/contrib_ops/cpu/tokenizer.cc b/onnxruntime/contrib_ops/cpu/tokenizer.cc
index 16c635cfd7..45998b6d83 100644
--- a/onnxruntime/contrib_ops/cpu/tokenizer.cc
+++ b/onnxruntime/contrib_ops/cpu/tokenizer.cc
@@ -2,6 +2,7 @@
 // Licensed under the MIT License.
 
 #include "core/common/common.h"
+#include "core/common/narrow.h"
 #include "core/common/utf8_util.h"
 #include "core/framework/tensor.h"
 #include "core/framework/op_kernel.h"
@@ -473,10 +474,10 @@ Status Tokenizer::Compute(OpKernelContext* ctx) const {
   size_t C = 0;
   if (input_dims.size() == 1) {
     N = 1;
-    C = gsl::narrow<size_t>(input_dims[0]);
+    C = narrow<size_t>(input_dims[0]);
   } else if (input_dims.size() == 2) {
-    N = gsl::narrow<size_t>(input_dims[0]);
-    C = gsl::narrow<size_t>(input_dims[1]);
+    N = narrow<size_t>(input_dims[0]);
+    C = narrow<size_t>(input_dims[1]);
   } else {
     return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT,
                   "Input dimensions are either [C] or [N][C] allowed");
diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search.cc b/onnxruntime/contrib_ops/cpu/transformers/beam_search.cc
index 3329f2377f..d4132675ed 100644
--- a/onnxruntime/contrib_ops/cpu/transformers/beam_search.cc
+++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search.cc
@@ -26,7 +26,7 @@
 #include "core/framework/TensorSeq.h"
 #include "core/framework/allocator.h"
 #include "core/framework/ort_value.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "contrib_ops/cpu/transformers/beam_search.h"
 #include "contrib_ops/cpu/transformers/logits_processor.h"
 #include "contrib_ops/cpu/transformers/sequences.h"
diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h
index 7bd8853c40..50606dfc8f 100644
--- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h
+++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h
@@ -5,6 +5,8 @@
 
 #include "contrib_ops/cpu/transformers/beam_search_impl_base.h"
 
+#include "core/common/span_utils.h"
+
 namespace onnxruntime {
 namespace contrib {
 
@@ -255,8 +257,8 @@ Status BeamSearchGpt<T>::Execute(const FeedsFetchesManager& feeds_fetches_manage
       bool increase_position = (iteration_counter > 1);
       ORT_RETURN_IF_ERROR(UpdateFeeds(fetches, feeds, current_length,
                                       position_ids, increase_position,
-                                      beam_next_tokens.as_span<const int32_t>(),
-                                      beam_indices.as_span<const int32_t>()));
+                                      ReinterpretAsSpan<const int32_t>(beam_next_tokens),
+                                      ReinterpretAsSpan<const int32_t>(beam_indices)));
     }
     fetches.clear();
   }
@@ -280,7 +282,7 @@ Status BeamSearchGpt<T>::Execute(const FeedsFetchesManager& feeds_fetches_manage
   if (output_scores != nullptr) {
     gsl::span<float> target = output_scores->MutableDataAsSpan<float>();
     gsl::span<const float> source = gsl::span<const float>(beam_state.scores.data(), beam_state.scores.size());
-    assert(target.length() == source.length());
+    assert(target.size() == source.size());
     ORT_RETURN_IF_ERROR(this->device_copy_func_(target, source, nullptr, DeviceCopyDirection::deviceToDevice));
   }
 
diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_t5.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_t5.h
index 304bb64c1b..e370d7bcb9 100644
--- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_t5.h
+++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_t5.h
@@ -3,6 +3,7 @@
 
 #pragma once
 
+#include "core/common/span_utils.h"
 #include "contrib_ops/cpu/transformers/generation_shared.h"  // for DEBUG_GENERATION
 #include "contrib_ops/cpu/transformers/beam_search_impl_base.h"
 #include "contrib_ops/cpu/transformers/subgraph_t5_encoder.h"
@@ -214,7 +215,7 @@ Status BeamSearchT5<T>::Execute(const FeedsFetchesManager& encoder_feeds_fetches
                                                 cpu_state,
                                                 iteration_counter));
     ++current_length;  // Increase sequence length after a new token is generated.
-    ORT_RETURN_IF_ERROR(decoder_subgraph_.CreateInitialFeeds(beam_next_tokens.as_span<const int32_t>(),
+    ORT_RETURN_IF_ERROR(decoder_subgraph_.CreateInitialFeeds(ReinterpretAsSpan<const int32_t>(beam_next_tokens),
                                                              this->implicit_inputs_,
                                                              encoder_feeds,
                                                              encoder_fetches,
@@ -284,8 +285,8 @@ Status BeamSearchT5<T>::Execute(const FeedsFetchesManager& encoder_feeds_fetches
           decoder_fetches,
           decoder_feeds,
           num_present_outputs,
-          beam_next_tokens.as_span<const int32_t>(),
-          beam_indices.as_span<const int32_t>(),
+          ReinterpretAsSpan<const int32_t>(beam_next_tokens),
+          ReinterpretAsSpan<const int32_t>(beam_indices),
           parameters->num_beams,
           decoder_subgraph_.GetFirstPastInputIndex(),
           decoder_subgraph_.GetFirstPresentOutputIndex(),
@@ -316,7 +317,7 @@ Status BeamSearchT5<T>::Execute(const FeedsFetchesManager& encoder_feeds_fetches
   if (output_scores != nullptr) {
     gsl::span<float> target = output_scores->MutableDataAsSpan<float>();
     gsl::span<const float> source = gsl::span<const float>(beam_state.scores.data(), beam_state.scores.size());
-    assert(target.length() == source.length());
+    assert(target.size() == source.size());
     ORT_RETURN_IF_ERROR(this->device_copy_func_(target, source, nullptr, DeviceCopyDirection::deviceToDevice));
   }
 
diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc b/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc
index 81ebb74830..e26d4245c8 100644
--- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc
+++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc
@@ -5,6 +5,7 @@
 #include <math.h>
 #include "core/common/common.h"
 #include "core/common/safeint.h"
+#include "core/common/span_utils.h"
 #include "core/framework/allocator.h"
 #include "core/framework/tensorprotoutils.h"
 #include "core/framework/utils.h"
@@ -188,7 +189,7 @@ void BeamSearchScorer::Process(ISequences* sequences,
         auto clone = hypothesis_buffer_.subspan(hypothesis_buffer_offset_, sequence_length);
         gsl::copy(src, clone);
         hypothesis_buffer_offset_ += static_cast<size_t>(sequence_length);
-        auto sequence = clone.template as_span<const int32_t>();
+        auto sequence = ReinterpretAsSpan<const int32_t>(clone);
         beam_hyp.Add(sequence, next_score);
       } else {
         // Add next predicted token since it is not eos_token.
@@ -209,7 +210,7 @@ void BeamSearchScorer::Process(ISequences* sequences,
     //  Check if we are done so that we can save a pad step if all(done)
     if (!done_[batch]) {
       gsl::span<const float> topk_scores = next_scores.subspan(batch * num_beams_, top_k);
-      const float* best_sum_logprobs = std::max_element(topk_scores.begin(), topk_scores.end());
+      const auto best_sum_logprobs = std::max_element(topk_scores.begin(), topk_scores.end());
       if (beam_hyp.IsDone(*best_sum_logprobs, sequence_length)) {
         done_[batch] = true;
       }
diff --git a/onnxruntime/contrib_ops/cpu/transformers/generate_impl_base.h b/onnxruntime/contrib_ops/cpu/transformers/generate_impl_base.h
index 3964d50c6d..751d5cce18 100644
--- a/onnxruntime/contrib_ops/cpu/transformers/generate_impl_base.h
+++ b/onnxruntime/contrib_ops/cpu/transformers/generate_impl_base.h
@@ -6,6 +6,7 @@
 #include <string>
 #include <utility>
 #include <vector>
+#include "core/common/span_utils.h"
 #include "contrib_ops/cpu/transformers/generation_shared.h"
 
 namespace onnxruntime {
@@ -142,7 +143,7 @@ class GenerateBase {
         return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
                                "Input 'attention_mask' is expected to have 2 dimensions, got ", dims_attn.size());
       }
-      if (dims_attn != dims) {
+      if (!SpanEq(dims_attn, dims)) {
         return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
                                "Input 'attention_mask' is expected to have same shape as input_ids");
       }
diff --git a/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.cc b/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.cc
index a6b0e0465a..20ba41ea85 100644
--- a/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.cc
+++ b/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.cc
@@ -7,7 +7,7 @@
 #include "core/providers/cpu/math/top_k.h"
 #include "core/providers/cpu/math/softmax_shared.h"
 #include "core/common/safeint.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "contrib_ops/cpu/transformers/sequences.h"
 #include "contrib_ops/cpu/transformers/beam_search_scorer.h"
 #include "contrib_ops/cpu/transformers/generation_device_helper.h"
@@ -526,7 +526,7 @@ void PickGptPastState(const std::vector<OrtValue>& last_outputs,
 
     gsl::span<T> past_span = gsl::make_span<T>(past.GetMutable<Tensor>()->MutableData<T>(), past_shape.Size());
     gsl::span<const T> present_span = gsl::make_span<const T>(present.Get<Tensor>().Data<T>(), past_shape.Size());
-    for (gsl::index j = 0; j < beam_indices.length(); j++) {
+    for (size_t j = 0; j < beam_indices.size(); j++) {
       int32_t beam_index = beam_indices[j];
       gsl::span<const T> present_key = present_span.subspan(beam_index * block_size_per_beam, block_size_per_beam);
       gsl::span<const T> present_value = present_span.subspan(past_key_size + beam_index * block_size_per_beam,
@@ -563,7 +563,7 @@ Status UpdateGptFeeds(
   // The following updates inputs for subgraph
 
   // Update input_ids with next tokens.
-  int batch_beam_size = static_cast<int>(beam_next_tokens.length());
+  int batch_beam_size = static_cast<int>(beam_next_tokens.size());
   int64_t dims[] = {batch_beam_size, 1};
   TensorShape input_ids_shape(&dims[0], 2);
   auto int32_type = DataTypeImpl::GetType<int32_t>();
@@ -712,7 +712,7 @@ void PickT5PastState(const std::vector<OrtValue>& last_outputs,
 
     gsl::span<T> past_span = gsl::make_span<T>(past.GetMutable<Tensor>()->MutableData<T>(), past_shape.Size());
     gsl::span<const T> present_span = gsl::make_span<const T>(present.Get<Tensor>().Data<T>(), past_shape.Size());
-    for (gsl::index j = 0; j < beam_indices.length(); j++) {
+    for (size_t j = 0; j < beam_indices.size(); j++) {
       int32_t beam_index = beam_indices[j];
       gsl::span<const T> present_beam = present_span.subspan(beam_index * block_size_per_beam, block_size_per_beam);
       gsl::span<T> past_beam = past_span.subspan(j * block_size_per_beam, block_size_per_beam);
@@ -750,7 +750,7 @@ Status UpdateDecoderFeeds(
   // Only need copy beam next tokens to input_ids, and copy present_*_self_* to past_*_self_*,
 
   // Update input_ids with next tokens.
-  int batch_beam_size = static_cast<int>(beam_next_tokens.length());
+  int batch_beam_size = static_cast<int>(beam_next_tokens.size());
 
   // TODO(tianleiwu): Reuse buffer for input_ids to reduce memory allocation.
   OrtValue input_ids;
diff --git a/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.h b/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.h
index 1cb3234823..ac64dc9cf5 100644
--- a/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.h
+++ b/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.h
@@ -10,7 +10,7 @@
 #endif
 
 #include <vector>
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "contrib_ops/cpu/transformers/logits_processor.h"
 #include "contrib_ops/cpu/transformers/generation_shared.h"
 
diff --git a/onnxruntime/contrib_ops/cpu/transformers/generation_shared.h b/onnxruntime/contrib_ops/cpu/transformers/generation_shared.h
index 1f1b63485c..a5794e5f4e 100644
--- a/onnxruntime/contrib_ops/cpu/transformers/generation_shared.h
+++ b/onnxruntime/contrib_ops/cpu/transformers/generation_shared.h
@@ -4,7 +4,7 @@
 #pragma once
 
 #include <utility>
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "core/framework/allocator.h"
 #include "core/framework/ort_value.h"
 
diff --git a/onnxruntime/contrib_ops/cpu/transformers/greedy_search.cc b/onnxruntime/contrib_ops/cpu/transformers/greedy_search.cc
index 088c59fac2..0299912cab 100644
--- a/onnxruntime/contrib_ops/cpu/transformers/greedy_search.cc
+++ b/onnxruntime/contrib_ops/cpu/transformers/greedy_search.cc
@@ -25,7 +25,7 @@
 #include "core/framework/session_options.h"
 #include "core/framework/TensorSeq.h"
 #include "core/framework/ort_value.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "contrib_ops/cpu/transformers/greedy_search.h"
 #include "contrib_ops/cpu/transformers/logits_processor.h"
 #include "contrib_ops/cpu/transformers/sequences.h"
diff --git a/onnxruntime/contrib_ops/cpu/transformers/greedy_search_impl_gpt.h b/onnxruntime/contrib_ops/cpu/transformers/greedy_search_impl_gpt.h
index 17936e1647..cbbcf29672 100644
--- a/onnxruntime/contrib_ops/cpu/transformers/greedy_search_impl_gpt.h
+++ b/onnxruntime/contrib_ops/cpu/transformers/greedy_search_impl_gpt.h
@@ -4,6 +4,8 @@
 #pragma once
 #include <algorithm>
 #include <vector>
+
+#include "core/common/span_utils.h"
 #include "contrib_ops/cpu/transformers/greedy_search_impl_base.h"
 
 namespace onnxruntime {
@@ -219,7 +221,7 @@ Status GreedySearchGpt<T>::Execute(const FeedsFetchesManager& feeds_fetches_mana
       bool increase_position = (iteration_counter > 1);
       ORT_RETURN_IF_ERROR(UpdateFeeds(fetches, feeds, current_length,
                                       position_ids, increase_position,
-                                      next_tokens.as_span<const int32_t>()));
+                                      ReinterpretAsSpan<const int32_t>(next_tokens)));
     }
     fetches.clear();
   }
diff --git a/onnxruntime/contrib_ops/cpu/transformers/logits_processor.cc b/onnxruntime/contrib_ops/cpu/transformers/logits_processor.cc
index df2a2215c2..2f1e657c8e 100644
--- a/onnxruntime/contrib_ops/cpu/transformers/logits_processor.cc
+++ b/onnxruntime/contrib_ops/cpu/transformers/logits_processor.cc
@@ -3,7 +3,9 @@
 
 #include <memory>
 #include <assert.h>
+#include "core/common/narrow.h"
 #include "core/common/safeint.h"
+#include "core/common/span_utils.h"
 #include "contrib_ops/cpu/transformers/logits_processor.h"
 #include "contrib_ops/cpu/transformers/dump_tensor.h"
 
@@ -100,15 +102,15 @@ void NoRepeatNGramLogitsProcessor<T>::Process(const ISequences* sequences,
     gsl::span<T> beam_token_scores = next_token_scores.GetScores(i);
     gsl::span<const int32_t> sequence = sequences->GetSequence(i);
 
-    gsl::span<const int32_t> prefix = sequence.subspan(sequence.length() - prefix_length);
-    ORT_ENFORCE(prefix.length() == prefix_length);
+    gsl::span<const int32_t> prefix = sequence.subspan(sequence.size() - prefix_length);
+    ORT_ENFORCE(prefix.size() == narrow<size_t>(prefix_length));
 
     std::unordered_set<int32_t> blocked_word_ids;
-    for (int j = 0; j <= static_cast<int>(sequence.length()) - ngram_size_; j++) {
+    for (int j = 0; j <= static_cast<int>(sequence.size()) - ngram_size_; j++) {
       // Here we use naive algorithm for matching. The complexity is O(batch_beam_size * ngram_size * sequence_length)
       // TODO(tianleiwu): build N-Gram index (hash table with prefix of length NGram - 1 as key,
       //                  and list of last word of NGram as value) for fast matching.
-      if (ngram_size_ == 1 || prefix == sequence.subspan(j, prefix_length)) {
+      if (ngram_size_ == 1 || SpanEq(prefix, sequence.subspan(j, prefix_length))) {
         blocked_word_ids.insert(sequence[static_cast<gsl::index>(j) + prefix_length]);
       }
     }
diff --git a/onnxruntime/contrib_ops/cpu/transformers/sequences.cc b/onnxruntime/contrib_ops/cpu/transformers/sequences.cc
index a69d486532..a42aa47bfd 100644
--- a/onnxruntime/contrib_ops/cpu/transformers/sequences.cc
+++ b/onnxruntime/contrib_ops/cpu/transformers/sequences.cc
@@ -10,7 +10,7 @@ namespace transformers {
 
 void Sequences::Init(gsl::span<int32_t> buffer, int batch_beam_size, int sequence_length, int max_length) {
   size_t sequences_size = SafeInt<size_t>(batch_beam_size) * max_length;
-  assert(buffer.length() == sequences_size + sequences_size);
+  assert(buffer.size() == sequences_size + sequences_size);
 
   sequences[0] = buffer.subspan(0, sequences_size);
   sequences[1] = buffer.subspan(sequences_size);
diff --git a/onnxruntime/contrib_ops/cpu/transformers/sequences.h b/onnxruntime/contrib_ops/cpu/transformers/sequences.h
index e7271d677e..cd7b714028 100644
--- a/onnxruntime/contrib_ops/cpu/transformers/sequences.h
+++ b/onnxruntime/contrib_ops/cpu/transformers/sequences.h
@@ -3,7 +3,7 @@
 
 #pragma once
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "contrib_ops/cpu/transformers/generation_shared.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.cc b/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.cc
index d56745c19f..3d98039fe8 100644
--- a/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.cc
+++ b/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.cc
@@ -7,7 +7,7 @@
 #include "core/framework/tensorprotoutils.h"
 #include "core/framework/utils.h"
 #include "core/providers/cpu/tensor/utils.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "contrib_ops/cpu/transformers/subgraph_base.h"
 #include "contrib_ops/cpu/transformers/dump_tensor.h"
 
diff --git a/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.h b/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.h
index 93c133a85d..05b661ac33 100644
--- a/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.h
+++ b/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.h
@@ -5,7 +5,7 @@
 
 #include <vector>
 #include <string>
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "core/framework/allocator.h"
 #include "core/framework/feeds_fetches_manager.h"
 #include "contrib_ops/cpu/transformers/generation_device_helper.h"
@@ -48,7 +48,7 @@ class Subgraph {
   Status Setup(const SessionState& session_state,
                const SessionState& subgraph_session_state);
 
-  FeedsFetchesManager* GetFeedsFetchesManager() { 
+  FeedsFetchesManager* GetFeedsFetchesManager() {
     return (feeds_fetches_manager_.has_value()) ? &*feeds_fetches_manager_ : nullptr;
   }
 
diff --git a/onnxruntime/contrib_ops/cpu/transformers/subgraph_gpt.cc b/onnxruntime/contrib_ops/cpu/transformers/subgraph_gpt.cc
index 1b4cf34d6d..33d1fe3ba7 100644
--- a/onnxruntime/contrib_ops/cpu/transformers/subgraph_gpt.cc
+++ b/onnxruntime/contrib_ops/cpu/transformers/subgraph_gpt.cc
@@ -6,7 +6,7 @@
 #include "core/framework/tensorprotoutils.h"
 #include "core/framework/utils.h"
 #include "core/providers/cpu/tensor/utils.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "contrib_ops/cpu/transformers/subgraph_gpt.h"
 #include "contrib_ops/cpu/transformers/dump_tensor.h"
 
diff --git a/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_decoder.cc b/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_decoder.cc
index 6702a1ab6b..9c48766121 100644
--- a/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_decoder.cc
+++ b/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_decoder.cc
@@ -6,7 +6,7 @@
 #include "core/framework/tensorprotoutils.h"
 #include "core/framework/utils.h"
 #include "core/providers/cpu/tensor/utils.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "contrib_ops/cpu/transformers/subgraph_t5_decoder.h"
 #include "contrib_ops/cpu/transformers/dump_tensor.h"
 #include "contrib_ops/cpu/transformers/generation_device_helper.h"
@@ -139,7 +139,7 @@ Status T5DecoderSubgraph::CreateInitialFeeds(
   AllocatorPtr allocator = session_state_->GetAllocator(encoder_feeds[0].Get<Tensor>().Location());
 
   // Copy beam next tokens in CPU to input_ids in provider device (CPU for CPU EP, or GPU for CUDA EP).
-  int batch_beam_size = static_cast<int>(beam_next_tokens.length());
+  int batch_beam_size = static_cast<int>(beam_next_tokens.size());
   int64_t dims[] = {batch_beam_size, 1};
   TensorShape input_ids_shape(&dims[0], 2);
   OrtValue input_ids;
diff --git a/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_encoder.cc b/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_encoder.cc
index c919efe77f..4ae39c39eb 100644
--- a/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_encoder.cc
+++ b/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_encoder.cc
@@ -6,7 +6,7 @@
 #include "core/framework/tensorprotoutils.h"
 #include "core/framework/utils.h"
 #include "core/providers/cpu/tensor/utils.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "contrib_ops/cpu/transformers/subgraph_t5_encoder.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu
index a578597410..a69c417d77 100644
--- a/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu
+++ b/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu
@@ -254,7 +254,7 @@ Status QkvToContext(
   } else if (nullptr != mask_index) {  // 1d mask index
     ORT_ENFORCE(mask_index_dims.size() == 1);
     // mask_index has 1D shape: either (batch_size) or (2*batch_size). Only the later one has start postions.
-    const int* mask_start = (mask_index_dims.at(0) > batch_size) ? mask_index + batch_size : nullptr;
+    const int* mask_start = (mask_index_dims[0] > batch_size) ? mask_index + batch_size : nullptr;
     ORT_RETURN_IF_ERROR(ComputeSoftmaxWithMask1D<T>(
         stream, total_sequence_length, sequence_length, batch_size, num_heads,
         mask_index, mask_start, data.extra_add_qk, scratch1, scratch2, parameters.is_unidirectional));
diff --git a/onnxruntime/contrib_ops/cuda/math/bias_dropout.h b/onnxruntime/contrib_ops/cuda/math/bias_dropout.h
index a28d6cd804..ef905595ce 100644
--- a/onnxruntime/contrib_ops/cuda/math/bias_dropout.h
+++ b/onnxruntime/contrib_ops/cuda/math/bias_dropout.h
@@ -3,7 +3,6 @@
 
 #pragma once
 
-#include "gsl/gsl"
 #include "core/providers/cuda/cuda_kernel.h"
 #include "core/providers/cuda/cuda_common.h"
 #include "core/framework/random_generator.h"
diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc
index ccb1b017a5..6f5e59bbcc 100644
--- a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc
+++ b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc
@@ -394,7 +394,7 @@ Status ProcessLogits(const OrtValue& logits,                                 //
 
   gsl::span<const float> next_scores = gsl::make_span(
       cpu_state->topk_scores.data(),
-      static_cast<typename gsl::span<float>::index_type>(topk_scores->Shape().Size()));
+      static_cast<typename gsl::span<float>::size_type>(topk_scores->Shape().Size()));
   gsl::span<const int32_t> next_tokens(cpu_state->topk_tokens.data(), beam_state->next_tokens.size());
   gsl::span<const int32_t> next_indices(cpu_state->topk_indices.data(), beam_state->next_indices.size());
 
@@ -579,7 +579,7 @@ Status PickGptPastState(const std::vector<OrtValue>& last_outputs,
 
     gsl::span<T> past_span = gsl::make_span<T>(past.GetMutable<Tensor>()->MutableData<T>(), past_shape.Size());
     gsl::span<const T> present_span = gsl::make_span<const T>(present.Get<Tensor>().Data<T>(), past_shape.Size());
-    for (gsl::index j = 0; j < beam_indices.length(); j++) {
+    for (size_t j = 0; j < beam_indices.size(); j++) {
       int32_t beam_index = beam_indices[j];
       gsl::span<const T> present_key = present_span.subspan(beam_index * block_size_per_beam, block_size_per_beam);
       gsl::span<const T> present_value = present_span.subspan(past_key_size + beam_index * block_size_per_beam,
@@ -623,7 +623,7 @@ Status PickT5PastState(const std::vector<OrtValue>& last_outputs,
 
     gsl::span<T> past_span = gsl::make_span<T>(past.GetMutable<Tensor>()->MutableData<T>(), past_shape.Size());
     gsl::span<const T> present_span = gsl::make_span<const T>(present.Get<Tensor>().Data<T>(), past_shape.Size());
-    for (gsl::index j = 0; j < beam_indices.length(); j++) {
+    for (size_t j = 0; j < beam_indices.size(); j++) {
       int32_t beam_index = beam_indices[j];
       gsl::span<const T> present_beam = present_span.subspan(beam_index * block_size_per_beam, block_size_per_beam);
       gsl::span<T> past_beam = past_span.subspan(j * block_size_per_beam, block_size_per_beam);
@@ -652,7 +652,7 @@ Status UpdateGptFeeds(
     int gpt_subgraph_first_past_input_idx,
     int gpt_subgraph_first_present_output_idx) {
   // Update input_ids with next tokens.
-  int batch_beam_size = static_cast<int>(beam_next_tokens.length());
+  int batch_beam_size = static_cast<int>(beam_next_tokens.size());
   int64_t dims[] = {batch_beam_size, 1};
   TensorShape input_ids_shape(&dims[0], 2);
   auto element_type = DataTypeImpl::GetType<int32_t>();
@@ -732,7 +732,7 @@ Status UpdateDecoderFeeds(
   ORT_UNUSED_PARAMETER(current_length);
 
   // Update input_ids with next tokens.
-  int batch_beam_size = static_cast<int>(beam_next_tokens.length());
+  int batch_beam_size = static_cast<int>(beam_next_tokens.size());
   int64_t dims[] = {batch_beam_size, 1};
   TensorShape input_ids_shape(&dims[0], 2);
   auto element_type = DataTypeImpl::GetType<int32_t>();
diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.h b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.h
index 1f97060b94..e0e5890d0b 100644
--- a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.h
+++ b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.h
@@ -7,7 +7,7 @@
 #include "core/providers/cpu/tensor/utils.h"
 #include "core/providers/cuda/cuda_common.h"
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "contrib_ops/cpu/transformers/generation_shared.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/contrib_ops/rocm/bert/attention_impl.cu b/onnxruntime/contrib_ops/rocm/bert/attention_impl.cu
index 315fa7acd2..af34728ff7 100644
--- a/onnxruntime/contrib_ops/rocm/bert/attention_impl.cu
+++ b/onnxruntime/contrib_ops/rocm/bert/attention_impl.cu
@@ -154,7 +154,7 @@ Status QkvToContext(
   // apply softmax and store result P to scratch2: BxNxSxS*
   if (use_raw_attention_mask) {  // 2d, 3d or 4d attention mask
     const int mask_dimension = static_cast<int>(mask_index_dims.size());
-    const int max_sequence_length = mask_dimension == 4 ? static_cast<int>(mask_index_dims.at(3)) : 0;
+    const int max_sequence_length = mask_dimension == 4 ? static_cast<int>(mask_index_dims[3]) : 0;
 
     T* persistent_softmax_workspace = scratch1;  // replace Q*K' in place if persistent softmax is selected.
     ORT_RETURN_IF_ERROR(
@@ -165,7 +165,7 @@ Status QkvToContext(
   } else if (nullptr != mask_index) {  // 1d mask index
     ORT_ENFORCE(mask_index_dims.size() == 1);
     // mask_index has 1D shape: either (batch_size) or (2*batch_size). Only the later one has start postions.
-    const int* mask_start = (mask_index_dims.at(0) > batch_size) ? mask_index + batch_size : nullptr;
+    const int* mask_start = (mask_index_dims[0] > batch_size) ? mask_index + batch_size : nullptr;
     ORT_RETURN_IF_ERROR(ComputeSoftmaxWithMask1D<T>(stream, all_sequence_length, sequence_length, batch_size, num_heads,
                                      mask_index, mask_start, extra_add_qk, scratch1, scratch2, is_unidirectional));
   } else {  // no mask
diff --git a/onnxruntime/core/codegen/mti/mti_tvm_utils.h b/onnxruntime/core/codegen/mti/mti_tvm_utils.h
index c2a14106c1..e85489dc1c 100644
--- a/onnxruntime/core/codegen/mti/mti_tvm_utils.h
+++ b/onnxruntime/core/codegen/mti/mti_tvm_utils.h
@@ -5,7 +5,7 @@
 
 #include <string>
 #include <vector>
-#include <gsl/gsl>
+#include "core/common/gsl.h"
 #include <tvm/tvm.h>
 #include "core/codegen/mti/common.h"
 
diff --git a/onnxruntime/core/codegen/mti/tensor/concat_ops.cc b/onnxruntime/core/codegen/mti/tensor/concat_ops.cc
index e7223d93b7..de78514b48 100644
--- a/onnxruntime/core/codegen/mti/tensor/concat_ops.cc
+++ b/onnxruntime/core/codegen/mti/tensor/concat_ops.cc
@@ -4,7 +4,7 @@
 #include "core/codegen/mti/tensor/concat_ops.h"
 
 #include "core/codegen/mti/mti_tvm_utils.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include <topi/transform.h>
 
 namespace onnxruntime {
diff --git a/onnxruntime/core/codegen/mti/tensor/gather.cc b/onnxruntime/core/codegen/mti/tensor/gather.cc
index 6748b5913a..3ea6ebf466 100644
--- a/onnxruntime/core/codegen/mti/tensor/gather.cc
+++ b/onnxruntime/core/codegen/mti/tensor/gather.cc
@@ -4,7 +4,7 @@
 #include "core/codegen/mti/tensor/gather.h"
 
 #include "core/codegen/mti/mti_tvm_utils.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include <topi/transform.h>
 
 namespace onnxruntime {
diff --git a/onnxruntime/core/codegen/mti/tensor/slice.cc b/onnxruntime/core/codegen/mti/tensor/slice.cc
index 6cbab43584..7c73be2b52 100644
--- a/onnxruntime/core/codegen/mti/tensor/slice.cc
+++ b/onnxruntime/core/codegen/mti/tensor/slice.cc
@@ -5,7 +5,7 @@
 
 #include "core/codegen/mti/mti_tvm_utils.h"
 #include <climits>
-#include <gsl/gsl>
+#include "core/common/gsl.h"
 #include <topi/transform.h>
 #include <tvm/ir_pass.h>
 
diff --git a/onnxruntime/core/codegen/mti/tensor/split.cc b/onnxruntime/core/codegen/mti/tensor/split.cc
index c8c11293bf..8dbbd8fdcc 100644
--- a/onnxruntime/core/codegen/mti/tensor/split.cc
+++ b/onnxruntime/core/codegen/mti/tensor/split.cc
@@ -4,7 +4,7 @@
 #include "core/codegen/mti/tensor/split.h"
 
 #include "core/codegen/mti/mti_tvm_utils.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include <topi/transform.h>
 
 namespace onnxruntime {
diff --git a/onnxruntime/core/codegen/mti/tensor/tile.cc b/onnxruntime/core/codegen/mti/tensor/tile.cc
index 1c7dafe36c..60ef29f7ce 100644
--- a/onnxruntime/core/codegen/mti/tensor/tile.cc
+++ b/onnxruntime/core/codegen/mti/tensor/tile.cc
@@ -3,7 +3,7 @@
 
 #include "core/codegen/mti/tensor/tile.h"
 #include "core/codegen/mti/mti_tvm_utils.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 namespace onnxruntime {
 namespace tvm_codegen {
diff --git a/onnxruntime/core/codegen/passes/utils/ort_tvm_utils.cc b/onnxruntime/core/codegen/passes/utils/ort_tvm_utils.cc
index a1b9135ff4..cdce2dec9b 100644
--- a/onnxruntime/core/codegen/passes/utils/ort_tvm_utils.cc
+++ b/onnxruntime/core/codegen/passes/utils/ort_tvm_utils.cc
@@ -7,7 +7,7 @@
 #include "core/codegen/passes/utils/codegen_context.h"
 #include "core/framework/tensorprotoutils.h"
 #include "core/providers/common.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 #include <topi/detail/extern.h>
 
diff --git a/onnxruntime/core/common/logging/capture.cc b/onnxruntime/core/common/logging/capture.cc
index cdc8ff7ec1..3c23e15e5c 100644
--- a/onnxruntime/core/common/logging/capture.cc
+++ b/onnxruntime/core/common/logging/capture.cc
@@ -3,7 +3,7 @@
 
 #include "core/common/logging/capture.h"
 #include "core/common/logging/logging.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 namespace onnxruntime {
 namespace logging {
@@ -43,7 +43,7 @@ void Capture::ProcessPrintf(msvc_printf_check const char* format, va_list args)
   const int nbrcharacters = vsnprintf(message.data(), message.size(), format, args);
 #endif
   error = nbrcharacters < 0;
-  truncated = (nbrcharacters >= 0 && static_cast<gsl::index>(nbrcharacters) > message.size());
+  truncated = (nbrcharacters >= 0 && static_cast<size_t>(nbrcharacters) > message.size());
 #endif
 
   if (error) {
diff --git a/onnxruntime/core/flatbuffers/flatbuffers_utils.cc b/onnxruntime/core/flatbuffers/flatbuffers_utils.cc
index 505b79548a..fbfcfed4b7 100644
--- a/onnxruntime/core/flatbuffers/flatbuffers_utils.cc
+++ b/onnxruntime/core/flatbuffers/flatbuffers_utils.cc
@@ -3,9 +3,8 @@
 
 #include "core/flatbuffers/flatbuffers_utils.h"
 
-#include "gsl/gsl"
-
 #include "core/common/common.h"
+#include "core/common/gsl.h"
 #include "core/flatbuffers/schema/ort.fbs.h"
 #include "core/graph/constants.h"
 #include "core/graph/onnx_protobuf.h"
diff --git a/onnxruntime/core/framework/allocation_planner.cc b/onnxruntime/core/framework/allocation_planner.cc
index 58c57839c0..0a2edf6588 100644
--- a/onnxruntime/core/framework/allocation_planner.cc
+++ b/onnxruntime/core/framework/allocation_planner.cc
@@ -605,10 +605,10 @@ class PlannerImpl {
         UseCount(name)++;
 
         bool is_graph_input = (graph_inputs.find(name) != graph_inputs.cend());
-        bool is_outer_scope_arg = std::find_if(outer_scope_node_args_.cbegin(), outer_scope_node_args_.cend(),
+        bool is_outer_scope_arg = std::find_if(outer_scope_node_args_.begin(), outer_scope_node_args_.end(),
                                                [&name](const NodeArg* value) {
                                                  return value && value->Name() == name;
-                                               }) != outer_scope_node_args_.cend();
+                                               }) != outer_scope_node_args_.end();
         bool is_subgraph = (parent_node_ != nullptr);
 
         // If it's a graph input or outer scope node arg, set its plan.
diff --git a/onnxruntime/core/framework/allocatormgr.cc b/onnxruntime/core/framework/allocatormgr.cc
index edd884310d..cc3826e50c 100644
--- a/onnxruntime/core/framework/allocatormgr.cc
+++ b/onnxruntime/core/framework/allocatormgr.cc
@@ -2,12 +2,15 @@
 // Licensed under the MIT License.
 
 #include "core/framework/allocatormgr.h"
-#include "core/framework/bfc_arena.h"
-#include "core/common/logging/logging.h"
+
+#include <limits>
 #include <mutex>
 #include <sstream>
 #include <unordered_map>
-#include <limits>
+
+#include "core/common/logging/logging.h"
+#include "core/common/narrow.h"
+#include "core/framework/bfc_arena.h"
 
 namespace onnxruntime {
 using namespace common;
@@ -15,9 +18,9 @@ using namespace common;
 namespace {
 int32_t MakeKey(OrtMemType mem_type, OrtDevice device) {
   // shorten device id so we can fit everything
-  uint8_t short_device = gsl::narrow<uint8_t>(device.Id());
+  uint8_t short_device = narrow<uint8_t>(device.Id());
   // and convert mem_type. OrtMemType weirdly uses -2 as the first value so we offset by that before narrowing
-  uint8_t ort_mem_type = gsl::narrow<uint8_t>(mem_type + 2);
+  uint8_t ort_mem_type = narrow<uint8_t>(mem_type + 2);
 
   // NOTE: OrtMemType is the type of memory for a kernel's input/output
   //       OrtDevice.MemType is the device memory type.
diff --git a/onnxruntime/core/framework/data_transfer_utils.h b/onnxruntime/core/framework/data_transfer_utils.h
index 43bf74eb7f..d54df49eeb 100644
--- a/onnxruntime/core/framework/data_transfer_utils.h
+++ b/onnxruntime/core/framework/data_transfer_utils.h
@@ -5,7 +5,7 @@
 
 #include <type_traits>
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 #include "core/common/common.h"
 #include "core/framework/tensor.h"
diff --git a/onnxruntime/core/framework/endian_utils.h b/onnxruntime/core/framework/endian_utils.h
index d44ce641c5..b83977c1ac 100644
--- a/onnxruntime/core/framework/endian_utils.h
+++ b/onnxruntime/core/framework/endian_utils.h
@@ -5,7 +5,7 @@
 
 #include <type_traits>
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 #include "core/common/status.h"
 #include "core/common/common.h"
diff --git a/onnxruntime/core/framework/error_code.cc b/onnxruntime/core/framework/error_code.cc
index d1d509fbe0..cc681c82cf 100644
--- a/onnxruntime/core/framework/error_code.cc
+++ b/onnxruntime/core/framework/error_code.cc
@@ -1,7 +1,6 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/common/gsl_suppress.h"
 #include "core/session/onnxruntime_c_api.h"
 #include "core/session/ort_apis.h"
 #include "core/common/status.h"
diff --git a/onnxruntime/core/framework/fallback_cpu_capability.h b/onnxruntime/core/framework/fallback_cpu_capability.h
index b6015cdd57..afc518d28a 100644
--- a/onnxruntime/core/framework/fallback_cpu_capability.h
+++ b/onnxruntime/core/framework/fallback_cpu_capability.h
@@ -3,8 +3,7 @@
 
 #pragma once
 
-#include <gsl/gsl>
-
+#include "core/common/gsl.h"
 #include "core/common/inlined_containers_fwd.h"
 #include "core/framework/execution_provider.h"  // for IExecutionProvider::IKernelLookup
 #include "core/graph/graph_viewer.h"
diff --git a/onnxruntime/core/framework/kernel_def_builder.cc b/onnxruntime/core/framework/kernel_def_builder.cc
index 463e8b583a..5d0d070afa 100644
--- a/onnxruntime/core/framework/kernel_def_builder.cc
+++ b/onnxruntime/core/framework/kernel_def_builder.cc
@@ -7,8 +7,6 @@
 #include <unordered_set>
 #include <string>
 
-#include "gsl/gsl"
-
 namespace onnxruntime {
 namespace {
 
diff --git a/onnxruntime/core/framework/kernel_lookup.h b/onnxruntime/core/framework/kernel_lookup.h
index 78e5c7dfe2..2b4d3ce816 100644
--- a/onnxruntime/core/framework/kernel_lookup.h
+++ b/onnxruntime/core/framework/kernel_lookup.h
@@ -3,9 +3,8 @@
 
 #pragma once
 
-#include "gsl/gsl"
-
 #include "core/common/common.h"
+#include "core/common/gsl.h"
 #include "core/framework/execution_provider.h"  // for IExecutionProvider::IKernelLookup
 #include "core/framework/kernel_registry.h"
 #include "core/framework/kernel_type_str_resolver.h"
diff --git a/onnxruntime/core/framework/kernel_registry_manager.h b/onnxruntime/core/framework/kernel_registry_manager.h
index 344ab220e9..1868583f41 100644
--- a/onnxruntime/core/framework/kernel_registry_manager.h
+++ b/onnxruntime/core/framework/kernel_registry_manager.h
@@ -7,8 +7,7 @@
 #include <variant>
 #include <unordered_map>
 
-#include "gsl/gsl"
-
+#include "core/common/gsl.h"
 #include "core/common/inlined_containers.h"
 #include "core/common/status.h"
 #include "core/framework/kernel_type_str_resolver.h"
diff --git a/onnxruntime/core/framework/kernel_type_str_resolver.h b/onnxruntime/core/framework/kernel_type_str_resolver.h
index cd94de1146..6655b52f41 100644
--- a/onnxruntime/core/framework/kernel_type_str_resolver.h
+++ b/onnxruntime/core/framework/kernel_type_str_resolver.h
@@ -7,12 +7,11 @@
 #include <string_view>
 #include <utility>
 
-#include "gsl/gsl"
-
 #if !defined(ORT_MINIMAL_BUILD)
 #include "onnx/defs/schema.h"
 #endif  // !defined(ORT_MINIMAL_BUILD)
 
+#include "core/common/gsl.h"
 #include "core/common/inlined_containers.h"
 #include "core/common/status.h"
 #include "core/graph/op_identifier.h"
diff --git a/onnxruntime/core/framework/kernel_type_str_resolver_utils.h b/onnxruntime/core/framework/kernel_type_str_resolver_utils.h
index b9535c31f1..3d06013e4f 100644
--- a/onnxruntime/core/framework/kernel_type_str_resolver_utils.h
+++ b/onnxruntime/core/framework/kernel_type_str_resolver_utils.h
@@ -5,8 +5,7 @@
 
 #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
 
-#include "gsl/gsl"
-
+#include "core/common/gsl.h"
 #include "core/common/status.h"
 #include "core/framework/kernel_type_str_resolver.h"
 #include "core/graph/op_identifier.h"
diff --git a/onnxruntime/core/framework/math.h b/onnxruntime/core/framework/math.h
index 15fc5f0a9a..b1ff24612c 100644
--- a/onnxruntime/core/framework/math.h
+++ b/onnxruntime/core/framework/math.h
@@ -3,8 +3,7 @@
 
 #pragma once
 
-#include <gsl/gsl>
-
+#include "core/common/narrow.h"
 #include "core/framework/tensor.h"
 #include "core/util/math_cpuonly.h"
 
@@ -12,12 +11,12 @@ namespace onnxruntime {
 
 template <typename T>
 auto EigenMap(Tensor& t) -> EigenVectorMap<T> {
-  return EigenVectorMap<T>(t.MutableData<T>(), gsl::narrow<ptrdiff_t>(t.Shape().Size()));
+  return EigenVectorMap<T>(t.MutableData<T>(), narrow<ptrdiff_t>(t.Shape().Size()));
 }
 
 template <typename T>
 auto EigenMap(const Tensor& t) -> ConstEigenVectorMap<T> {
-  return ConstEigenVectorMap<T>(t.Data<T>(), gsl::narrow<ptrdiff_t>(t.Shape().Size()));
+  return ConstEigenVectorMap<T>(t.Data<T>(), narrow<ptrdiff_t>(t.Shape().Size()));
 }
 
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/framework/onnxruntime_typeinfo.h b/onnxruntime/core/framework/onnxruntime_typeinfo.h
index 67e270f5be..5b9145d32e 100644
--- a/onnxruntime/core/framework/onnxruntime_typeinfo.h
+++ b/onnxruntime/core/framework/onnxruntime_typeinfo.h
@@ -4,7 +4,6 @@
 #pragma once
 #include <atomic>
 #include <string>
-#include "core/common/gsl_suppress.h"
 #include "core/session/onnxruntime_c_api.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/core/framework/op_node_proto_helper.cc b/onnxruntime/core/framework/op_node_proto_helper.cc
index 0872d36b1f..38d67eb0e0 100644
--- a/onnxruntime/core/framework/op_node_proto_helper.cc
+++ b/onnxruntime/core/framework/op_node_proto_helper.cc
@@ -6,7 +6,7 @@
 #include "core/framework/tensorprotoutils.h"
 #include "core/graph/onnx_protobuf.h"
 #include "core/graph/op.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 using namespace ONNX_NAMESPACE;
 using namespace ::onnxruntime::common;
diff --git a/onnxruntime/core/framework/random_seed.cc b/onnxruntime/core/framework/random_seed.cc
index dc6084c294..9166e56ec5 100644
--- a/onnxruntime/core/framework/random_seed.cc
+++ b/onnxruntime/core/framework/random_seed.cc
@@ -3,14 +3,13 @@
 
 #include "random_seed.h"
 #include "random_generator.h"
-#include "core/common/gsl_suppress.h"
 #include <atomic>
 #include <chrono>
 
 namespace onnxruntime {
 namespace utils {
 
-// "Global initializer calls a non-constexpr function." 
+// "Global initializer calls a non-constexpr function."
 //TODO: Fix the warning. The variable should be put in the environment class.
 #if defined(_MSC_VER) && !defined(__clang__)
 #pragma warning(push)
diff --git a/onnxruntime/core/framework/session_options.h b/onnxruntime/core/framework/session_options.h
index 827cb3c257..8001fca7da 100644
--- a/onnxruntime/core/framework/session_options.h
+++ b/onnxruntime/core/framework/session_options.h
@@ -5,7 +5,7 @@
 
 #include <string>
 #include <vector>
-#include "core/common/gsl_suppress.h"
+#include "core/common/gsl.h"
 #include "core/common/inlined_containers.h"
 #include "core/session/onnxruntime_c_api.h"
 #include "core/optimizer/graph_transformer_level.h"
diff --git a/onnxruntime/core/framework/session_state.h b/onnxruntime/core/framework/session_state.h
index 4e153dc8c2..cacb206a33 100644
--- a/onnxruntime/core/framework/session_state.h
+++ b/onnxruntime/core/framework/session_state.h
@@ -8,7 +8,7 @@
 #include <unordered_map>
 #include <vector>
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 #include "core/common/common.h"
 #include "core/common/inlined_containers.h"
diff --git a/onnxruntime/core/framework/session_state_utils.cc b/onnxruntime/core/framework/session_state_utils.cc
index 0d0015e9bd..4f4b71a8ac 100644
--- a/onnxruntime/core/framework/session_state_utils.cc
+++ b/onnxruntime/core/framework/session_state_utils.cc
@@ -346,11 +346,11 @@ common::Status SaveInitializedTensors(
 template <typename T>  // T is container of const NodeArg* or NodeArg*
 static bool IsArgNameInInputsOutputs(const std::string& name,
                                      const T& graph_args) {
-  auto it = std::find_if(graph_args.cbegin(), graph_args.cend(),
+  auto it = std::find_if(graph_args.begin(), graph_args.end(),
                          [&name](const onnxruntime::NodeArg* arg) {
                            return arg->Name() == name;
                          });
-  return it != graph_args.cend();
+  return it != graph_args.end();
 }
 
 common::Status SaveInputOutputNamesToNodeMapping(const onnxruntime::GraphViewer& graph,
diff --git a/onnxruntime/core/framework/sparse_tensor.cc b/onnxruntime/core/framework/sparse_tensor.cc
index 83ada96c05..b19738b111 100644
--- a/onnxruntime/core/framework/sparse_tensor.cc
+++ b/onnxruntime/core/framework/sparse_tensor.cc
@@ -3,14 +3,15 @@
 
 #if !defined(DISABLE_SPARSE_TENSORS)
 
-#include "core/framework/data_types.h"
 #include "core/framework/sparse_tensor.h"
+
+#include "core/common/narrow.h"
+#include "core/common/safeint.h"
 #include "core/framework/data_transfer_manager.h"
+#include "core/framework/data_types.h"
 #include "core/framework/ort_value.h"
 #include "core/framework/utils.h"
 
-#include "core/common/safeint.h"
-
 using namespace onnxruntime::common;
 
 namespace onnxruntime {
@@ -48,7 +49,7 @@ inline std::vector<std::reference_wrapper<const Tensor>> MakeListConst(const T&.
 void CopyStrings(const Tensor& src_t, Tensor& dst_t) {
   auto src_span = src_t.DataAsSpan<std::string>();
   std::string* dst = dst_t.MutableData<std::string>();
-  std::copy(src_span.cbegin(), src_span.cend(), dst);
+  std::copy(src_span.begin(), src_span.end(), dst);
 }
 
 Status CopyData(const IDataTransfer* data_transfer,
@@ -224,7 +225,7 @@ Status SparseTensor::AllocateBuffer(int64_t buffer_size, size_t num_values) {
       // We own the buffer, so we must properly construct strings. Neither of the Tensors
       // we construct on top of the buffer own it. We are constructing empty strings, hopefully
       // nothrow and no buffer allocation
-      utils::ConstructStrings(data_ptr.get(), gsl::narrow<int64_t>(num_values));
+      utils::ConstructStrings(data_ptr.get(), narrow<int64_t>(num_values));
     }
     p_data_ = data_ptr.release();
   }
@@ -250,7 +251,7 @@ SparseTensor::CooView SparseTensor::AsCoo() const {
 }
 
 std::vector<int64_t> SparseTensor::GetCooIndexDims(size_t values_count, size_t index_size) const {
-  std::vector<int64_t> index_dims{gsl::narrow<int64_t>(values_count)};
+  std::vector<int64_t> index_dims{narrow<int64_t>(values_count)};
   if (values_count * 2 == index_size) {
     // 2-D COO index
     index_dims.push_back(2);
@@ -309,14 +310,14 @@ Status SparseTensor::MakeCooStrings(size_t string_count, const char* const* stri
 SparseTensor::CooMutator SparseTensor::MakeCooData(size_t values_count, size_t index_count) {
   ORT_ENFORCE(Format() == SparseFormat::kUndefined, "Sparse format must not be set. Already contains format: ", Format());
   ORT_ENFORCE(allocator_ != nullptr, "This method should follow a call to constructor that supplies the allocator");
-  const auto num_values = gsl::narrow<int64_t>(values_count);
+  const auto num_values = narrow<int64_t>(values_count);
   TensorShape values_shape{num_values};
   TensorShape index_shape(GetCooIndexDims(values_count, index_count));
   if (num_values > 0) {
     const auto data_size = SafeInt<size_t>(values_count) * ml_data_type_->Size();
     const auto index_size = SafeInt<size_t>(index_count) * sizeof(int64_t);
-    const auto required_buffer_size = CalculateRequiredBufferSize(gsl::narrow<int64_t>(data_size),
-                                                                  gsl::narrow<int64_t>(index_size));
+    const auto required_buffer_size = CalculateRequiredBufferSize(narrow<int64_t>(data_size),
+                                                                  narrow<int64_t>(index_size));
     ORT_THROW_IF_ERROR(AllocateBuffer(required_buffer_size, values_count));
   }
   values_ = Tensor(DataType(), values_shape, p_data_, Location());
@@ -407,12 +408,12 @@ SparseTensor::CsrMutator SparseTensor::MakeCsrData(size_t values_count,
   if (values_count > 0) {
     const auto data_size = SafeInt<size_t>(values_count) * ml_data_type_->Size();
     const auto index_size = (SafeInt<size_t>(inner_index_count) + outer_index_count) * sizeof(int64_t);
-    const auto required_buffer_size = CalculateRequiredBufferSize(gsl::narrow<int64_t>(data_size),
-                                                                  gsl::narrow<int64_t>(index_size));
+    const auto required_buffer_size = CalculateRequiredBufferSize(narrow<int64_t>(data_size),
+                                                                  narrow<int64_t>(index_size));
     ORT_THROW_IF_ERROR(AllocateBuffer(required_buffer_size, values_count));
   }
 
-  const auto num_values = gsl::narrow<int64_t>(values_count);
+  const auto num_values = narrow<int64_t>(values_count);
   values_ = Tensor(DataType(), {num_values}, p_data_, Location());
 
   auto* inner_index_start = reinterpret_cast<int64_t*>(IndicesStart(values_.SizeInBytes()));
@@ -478,7 +479,7 @@ Status SparseTensor::MakeBlockSparseStrings(const TensorShape& values_shape, con
                                             const TensorShape& indices_shape, const int32_t* indices_data) {
   ORT_RETURN_IF_NOT(IsDataTypeString(), "Expecting data type to be set as string");
   auto mutator = MakeBlockSparseData(values_shape, indices_shape);
-  auto string_count = gsl::narrow<size_t>(values_shape.Size());
+  auto string_count = narrow<size_t>(values_shape.Size());
   if (string_count > 0) {
     auto& dst_values = mutator.Values();
     auto& dst_indices = mutator.Indices();
@@ -495,8 +496,8 @@ SparseTensor::BlockSparseMutator SparseTensor::MakeBlockSparseData(const TensorS
   if (values_shape.Size() > 0) {
     const auto data_size = SafeInt<int64_t>(values_shape.Size()) * ml_data_type_->Size();
     const auto index_size = SafeInt<int64_t>(indices_shape.Size()) * sizeof(int32_t);
-    const auto required_buffer_size = CalculateRequiredBufferSize(gsl::narrow<int64_t>(data_size),
-                                                                  gsl::narrow<int64_t>(index_size));
+    const auto required_buffer_size = CalculateRequiredBufferSize(narrow<int64_t>(data_size),
+                                                                  narrow<int64_t>(index_size));
     ORT_THROW_IF_ERROR(AllocateBuffer(required_buffer_size, static_cast<size_t>(data_size / ml_data_type_->Size())));
   }
 
diff --git a/onnxruntime/core/framework/sparse_utils.cc b/onnxruntime/core/framework/sparse_utils.cc
index 2726d468d6..7f9eeee12a 100644
--- a/onnxruntime/core/framework/sparse_utils.cc
+++ b/onnxruntime/core/framework/sparse_utils.cc
@@ -4,6 +4,8 @@
 #if !defined(DISABLE_SPARSE_TENSORS)
 
 #include "core/framework/sparse_utils.h"
+
+#include "core/common/span_utils.h"
 #include "core/common/status.h"
 #include "core/framework/tensor.h"
 #include "core/framework/data_types_internal.h"
@@ -127,19 +129,19 @@ Status DenseTensorToSparseCsr(const DataTransferManager& data_manager, const Ten
       } break;
       case sizeof(uint16_t): {
         // MFFloat16 and BFloat16 are handled fine
-        auto span16 = src_span.as_span<const uint16_t>();
+        auto span16 = ReinterpretAsSpan<const uint16_t>(src_span);
         ScanAndRecordCsr(span16, cols, inner_indices, outer_indices, [&](uint16_t v) { values_16.push_back(v); });
         Tensor t(src.DataType(), {static_cast<int64_t>(values_16.size())}, values_16.data(), cpu_allocator->Info());
         nnz_tensor = std::move(t);
       } break;
       case sizeof(uint32_t): {
-        auto span32 = src_span.as_span<const uint32_t>();
+        auto span32 = ReinterpretAsSpan<const uint32_t>(src_span);
         ScanAndRecordCsr(span32, cols, inner_indices, outer_indices, [&](uint32_t v) { values_32.push_back(v); });
         Tensor t(src.DataType(), {static_cast<int64_t>(values_32.size())}, values_32.data(), cpu_allocator->Info());
         nnz_tensor = std::move(t);
       } break;
       case sizeof(uint64_t): {
-        auto span64 = src_span.as_span<const uint64_t>();
+        auto span64 = ReinterpretAsSpan<const uint64_t>(src_span);
         ScanAndRecordCsr(span64, cols, inner_indices, outer_indices, [&](uint64_t v) { values_64.push_back(v); });
         Tensor t(src.DataType(), {static_cast<int64_t>(values_64.size())}, values_64.data(), cpu_allocator->Info());
         nnz_tensor = std::move(t);
@@ -463,21 +465,21 @@ Status DenseTensorToSparseCoo(const DataTransferManager& data_manager, const Ten
       } break;
       case sizeof(uint16_t): {
         // MFFloat16 and BFloat16 are handled fine
-        auto span16 = src_span.as_span<const uint16_t>();
+        auto span16 = ReinterpretAsSpan<const uint16_t>(src_span);
         ScanAndRecordCoo(span16, cols, linear_index, gathered_indices, [&](int16_t v) { values_16.push_back(v); });
         Tensor t(src.DataType(), TensorShape{static_cast<int64_t>(values_16.size())},
                  values_16.data(), cpu_allocator->Info());
         nnz_tensor = std::move(t);
       } break;
       case sizeof(uint32_t): {
-        auto span32 = src_span.as_span<const uint32_t>();
+        auto span32 = ReinterpretAsSpan<const uint32_t>(src_span);
         ScanAndRecordCoo(span32, cols, linear_index, gathered_indices, [&](int32_t v) { values_32.push_back(v); });
         Tensor t(src.DataType(), TensorShape{static_cast<int64_t>(values_32.size())},
                  values_32.data(), cpu_allocator->Info());
         nnz_tensor = std::move(t);
       } break;
       case sizeof(uint64_t): {
-        auto span64 = src_span.as_span<const uint64_t>();
+        auto span64 = ReinterpretAsSpan<const uint64_t>(src_span);
         ScanAndRecordCoo(span64, cols, linear_index, gathered_indices, [&](int64_t v) { values_64.push_back(v); });
         Tensor t(src.DataType(), TensorShape{static_cast<int64_t>(values_64.size())},
                  values_64.data(), cpu_allocator->Info());
diff --git a/onnxruntime/core/framework/tensor_type_and_shape.cc b/onnxruntime/core/framework/tensor_type_and_shape.cc
index 5f29fabcec..bebfa72c54 100644
--- a/onnxruntime/core/framework/tensor_type_and_shape.cc
+++ b/onnxruntime/core/framework/tensor_type_and_shape.cc
@@ -7,6 +7,7 @@
 #include <atomic>
 #include <stdexcept>
 
+#include "core/common/narrow.h"
 #include "core/common/safeint.h"
 #include "core/framework/error_code_helper.h"
 #include "core/framework/ort_value.h"
@@ -24,6 +25,7 @@ using onnxruntime::MLFloat16;
 using onnxruntime::SparseTensor;
 #endif
 using onnxruntime::Tensor;
+using onnxruntime::narrow;
 #if defined(_MSC_VER) && !defined(__clang__)
 #pragma warning(disable : 26409)
 #endif
@@ -292,7 +294,7 @@ ORT_API_STATUS_IMPL(OrtApis::GetSparseTensorIndices, _In_ const OrtValue* v,
   API_IMPL_BEGIN
 #if !defined(DISABLE_SPARSE_TENSORS)
   const Tensor& indices_tensor = GetIndicesTensor(*v, indices_format);
-  *num_indices = gsl::narrow<size_t>(indices_tensor.Shape().Size());
+  *num_indices = narrow<size_t>(indices_tensor.Shape().Size());
   *indices = indices_tensor.DataRaw();
   return nullptr;
 #else
diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc
index 02a5dcb707..22faa67373 100644
--- a/onnxruntime/core/framework/tensorprotoutils.cc
+++ b/onnxruntime/core/framework/tensorprotoutils.cc
@@ -6,9 +6,11 @@
 #include <memory>
 #include <algorithm>
 #include <limits>
-#include <gsl/gsl>
 
+#include "core/common/gsl.h"
 #include "core/common/logging/logging.h"
+#include "core/common/narrow.h"
+#include "core/common/span_utils.h"
 #include "core/graph/onnx_protobuf.h"
 #include "core/framework/endian_utils.h"
 #include "core/framework/op_kernel.h"
@@ -619,7 +621,7 @@ Status GetExtDataFromTensorProto(const Env& env, const ORTCHAR_T* model_path,
 
     SafeInt<FileOffsetType> end_of_read(file_offset);
     end_of_read += raw_data_safe_len;
-    ORT_RETURN_IF(file_offset < 0 || end_of_read > gsl::narrow<FileOffsetType>(file_length),
+    ORT_RETURN_IF(file_offset < 0 || end_of_read > narrow<FileOffsetType>(file_length),
                   "External initializer: ", tensor_proto.name(),
                   " offset: ", file_offset, " size to read: ", static_cast<size_t>(raw_data_safe_len),
                   " given file_length: ", file_length, " are out of bounds or can not be read in full.");
@@ -897,7 +899,7 @@ static Status CopySparseData(size_t n_sparse_elements,
                              std::function<void(size_t from_idx, size_t to_idx)> copier) {
   Status status = Status::OK();
   TensorShape indices_shape(indices.dims().data(), indices.dims().size());
-  const auto elements = gsl::narrow<size_t>(indices_shape.Size());
+  const auto elements = narrow<size_t>(indices_shape.Size());
 
   std::vector<int64_t> indices_values;  // used for conversion of smaller size indices
   std::vector<uint8_t> unpack_buffer;
@@ -909,7 +911,7 @@ static Status CopySparseData(size_t n_sparse_elements,
         ORT_RETURN_IF_NOT(indices.raw_data().size() == (elements * sizeof(int64_t)),
                           "Sparse Indices raw data size does not match expected.");
         ORT_RETURN_IF_ERROR(UnpackInitializerData(indices, model_path, unpack_buffer));
-        indices_data = gsl::make_span(unpack_buffer).as_span<const int64_t>();
+        indices_data = ReinterpretAsSpan<const int64_t>(gsl::make_span(unpack_buffer));
       } else {
         ORT_RETURN_IF_NOT(indices.int64_data_size() == static_cast<int64_t>(elements), "Sparse indices int64 data size does not match expected");
         indices_data = gsl::make_span(indices.int64_data().data(), elements);
@@ -920,8 +922,8 @@ static Status CopySparseData(size_t n_sparse_elements,
         ORT_RETURN_IF_NOT(indices.raw_data().size() == (elements * sizeof(int32_t)),
                           "Sparse Indices raw data size does not match expected.");
         ORT_RETURN_IF_ERROR(UnpackInitializerData(indices, model_path, unpack_buffer));
-        auto int32_span = gsl::make_span(unpack_buffer).as_span<const int32_t>();
-        indices_values.insert(indices_values.cend(), int32_span.cbegin(), int32_span.cend());
+        auto int32_span = ReinterpretAsSpan<const int32_t>(gsl::make_span(unpack_buffer));
+        indices_values.insert(indices_values.cend(), int32_span.begin(), int32_span.end());
         unpack_buffer.clear();
         unpack_buffer.shrink_to_fit();
       } else {
@@ -936,8 +938,8 @@ static Status CopySparseData(size_t n_sparse_elements,
         ORT_RETURN_IF_NOT(indices.raw_data().size() == (elements * sizeof(int16_t)),
                           "Sparse Indices raw data size does not match expected.");
         ORT_RETURN_IF_ERROR(UnpackInitializerData(indices, model_path, unpack_buffer));
-        auto int16_span = gsl::make_span(unpack_buffer).as_span<const int16_t>();
-        indices_values.insert(indices_values.cend(), int16_span.cbegin(), int16_span.cend());
+        auto int16_span = ReinterpretAsSpan<const int16_t>(gsl::make_span(unpack_buffer));
+        indices_values.insert(indices_values.cend(), int16_span.begin(), int16_span.end());
         indices_data = gsl::make_span(indices_values);
         unpack_buffer.clear();
         unpack_buffer.shrink_to_fit();
@@ -952,8 +954,8 @@ static Status CopySparseData(size_t n_sparse_elements,
         ORT_RETURN_IF_NOT(indices.raw_data().size() == elements,
                           "Sparse Indices raw data size does not match expected.");
         ORT_RETURN_IF_ERROR(UnpackInitializerData(indices, model_path, unpack_buffer));
-        auto int8_span = gsl::make_span(unpack_buffer).as_span<const int8_t>();
-        indices_values.insert(indices_values.cend(), int8_span.cbegin(), int8_span.cend());
+        auto int8_span = ReinterpretAsSpan<const int8_t>(gsl::make_span(unpack_buffer));
+        indices_values.insert(indices_values.cend(), int8_span.begin(), int8_span.end());
         indices_data = gsl::make_span(indices_values);
         unpack_buffer.clear();
         unpack_buffer.shrink_to_fit();
@@ -971,13 +973,13 @@ static Status CopySparseData(size_t n_sparse_elements,
   if (indices_shape.NumDimensions() == 1) {
     // flattened indexes
     for (size_t i = 0; i < n_sparse_elements; ++i) {
-      copier(i, gsl::narrow<size_t>(indices_data[i]));
+      copier(i, narrow<size_t>(indices_data[i]));
     }
   } else if (indices_shape.NumDimensions() == 2) {
     // entries in format {NNZ, rank}
     ORT_ENFORCE(indices_shape[1] > 0 && static_cast<size_t>(indices_shape[1]) == dims.size());
     auto rank = static_cast<size_t>(indices_shape[1]);
-    const int64_t* cur_index = indices_data.data();
+    auto cur_index = indices_data.begin();
     std::vector<size_t> multipliers;
     multipliers.resize(rank);
 
@@ -1001,7 +1003,7 @@ static Status CopySparseData(size_t n_sparse_elements,
       cur_index += rank;
     }
 
-    ORT_ENFORCE(cur_index == &*indices_data.cend());
+    ORT_ENFORCE(cur_index == indices_data.end());
   } else {
     status = ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_GRAPH, "Invalid SparseTensor indices. Should be rank 0 or 1. Got:",
                              indices_shape);
diff --git a/onnxruntime/core/framework/transpose_helper.h b/onnxruntime/core/framework/transpose_helper.h
index 99e3dd9a5a..bb7a04d097 100644
--- a/onnxruntime/core/framework/transpose_helper.h
+++ b/onnxruntime/core/framework/transpose_helper.h
@@ -36,10 +36,10 @@ We fall back to the default implementation in all other cases, and if the input
 #include "core/framework/tensor_shape.h"
 #include "core/framework/tensor.h"
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 namespace onnxruntime {
 bool IsTransposeMovingSingleAxis(gsl::span<const size_t> permutations, size_t& from, size_t& to);
 void SingleAxisTranspose(gsl::span<const size_t> permutations, const Tensor& input, Tensor& output, size_t from,
                          size_t to, const TensorShape* input_shape_override = nullptr);
-}  // namespace onnxruntime
\ No newline at end of file
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc
index 2b73349649..cf8302872a 100644
--- a/onnxruntime/core/graph/graph.cc
+++ b/onnxruntime/core/graph/graph.cc
@@ -11,9 +11,10 @@
 #include <queue>
 
 #include "core/common/common.h"
-#include "gsl/gsl"
-#include "core/common/logging/logging.h"
+#include "core/common/gsl.h"
 #include "core/common/inlined_containers.h"
+#include "core/common/logging/logging.h"
+#include "core/common/narrow.h"
 #include "core/flatbuffers/flatbuffers_utils.h"
 #include "core/flatbuffers/schema/ort.fbs.h"
 #include "core/framework/tensor_shape.h"
@@ -690,7 +691,7 @@ Status Node::SaveToOrtFormat(flatbuffers::FlatBufferBuilder& builder,
   nb.add_doc_string(doc_string);
   nb.add_domain(domain);
   nb.add_since_version(since_version_);
-  nb.add_index(gsl::narrow<uint32_t>(index_));
+  nb.add_index(narrow<uint32_t>(index_));
   nb.add_op_type(op_type);
   nb.add_type(static_cast<fbs::NodeType>(node_type_));
   nb.add_execution_provider_type(ep);
@@ -708,7 +709,7 @@ flatbuffers::Offset<fbs::NodeEdge> Node::SaveEdgesToOrtFormat(flatbuffers::FlatB
     std::vector<fbs::EdgeEnd> edges;
     edges.reserve(edge_set.size());
     for (const auto& edge : edge_set)
-      edges.push_back(fbs::EdgeEnd(gsl::narrow<uint32_t>(edge.GetNode().Index()),
+      edges.push_back(fbs::EdgeEnd(narrow<uint32_t>(edge.GetNode().Index()),
                                    edge.GetSrcArgIndex(), edge.GetDstArgIndex()));
 
     return edges;
@@ -716,7 +717,7 @@ flatbuffers::Offset<fbs::NodeEdge> Node::SaveEdgesToOrtFormat(flatbuffers::FlatB
 
   const auto input_edges = get_edges(relationships_.input_edges);
   const auto output_edges = get_edges(relationships_.output_edges);
-  return fbs::CreateNodeEdgeDirect(builder, gsl::narrow<uint32_t>(index_), &input_edges, &output_edges);
+  return fbs::CreateNodeEdgeDirect(builder, narrow<uint32_t>(index_), &input_edges, &output_edges);
 }
 
 #endif  // !defined(ORT_MINIMAL_BUILD)
@@ -2802,7 +2803,7 @@ static void RemoveRepeatedFieldEntry(T& repeated_field, const TIter& entry_to_re
     // we do this so we don't have to move all the entries past the one being deleted down one.
     auto slot = entry_to_remove - repeated_field.begin();
     auto last_entry = repeated_field.end() - 1;
-    repeated_field.SwapElements(gsl::narrow<int>(slot), gsl::narrow<int>(num_entries - 1));
+    repeated_field.SwapElements(narrow<int>(slot), narrow<int>(num_entries - 1));
     repeated_field.erase(last_entry);
   } else {
     repeated_field.erase(entry_to_remove);
diff --git a/onnxruntime/core/graph/graph_flatbuffers_utils.cc b/onnxruntime/core/graph/graph_flatbuffers_utils.cc
index 54e03a985d..15ada5fe8c 100644
--- a/onnxruntime/core/graph/graph_flatbuffers_utils.cc
+++ b/onnxruntime/core/graph/graph_flatbuffers_utils.cc
@@ -1,16 +1,18 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <core/graph/graph.h>
+#include "graph_flatbuffers_utils.h"
+
+#include "flatbuffers/flatbuffers.h"
+
+#include "core/common/narrow.h"
 #include "core/flatbuffers/flatbuffers_utils.h"
 #include "core/flatbuffers/schema/ort.fbs.h"
 #include "core/framework/tensorprotoutils.h"
 #include "core/framework/tensor_external_data_info.h"
-#include "graph_flatbuffers_utils.h"
-#include "flatbuffers/flatbuffers.h"
+#include "core/graph/graph.h"
 
 using namespace ONNX_NAMESPACE;
-using namespace ::onnxruntime::common;
 
 namespace onnxruntime::fbs::utils {
 
@@ -61,6 +63,7 @@ Status SaveInitializerOrtFormat(flatbuffers::FlatBufferBuilder& builder,
   return Status::OK();
 }
 
+#if !defined(DISABLE_SPARSE_TENSORS)
 Status SaveSparseInitializerOrtFormat(flatbuffers::FlatBufferBuilder& builder,
                                       const ONNX_NAMESPACE::SparseTensorProto& initializer,
                                       const Path& model_path,
@@ -87,6 +90,7 @@ Status SaveSparseInitializerOrtFormat(flatbuffers::FlatBufferBuilder& builder,
 
   return Status::OK();
 }
+#endif  // !defined(DISABLE_SPARSE_TENSORS)
 
 #define GET_FBS_ATTR(BUILDER, TYPE, DATA_NAME, DATA) \
   fbs::AttributeBuilder attr_builder(BUILDER);       \
@@ -205,7 +209,7 @@ Status LoadInitializerOrtFormat(const fbs::Tensor& fbs_tensor, TensorProto& init
       // we reinterpret_cast this back to void* in tensorprotoutils.cc:GetExtDataFromTensorProto.
       // use intptr_t as OFFSET_TYPE is signed. in theory you could get a weird looking value if the address uses the
       // high bit, but that should be unlikely in a scenario where we care about memory usage enough to use this path.
-      auto offset = gsl::narrow<ExternalDataInfo::OFFSET_TYPE>(reinterpret_cast<intptr_t>(data_offset));
+      auto offset = narrow<ExternalDataInfo::OFFSET_TYPE>(reinterpret_cast<intptr_t>(data_offset));
 
       ONNX_NAMESPACE::StringStringEntryProto* entry = initializer.mutable_external_data()->Add();
       entry->set_key("location");
@@ -225,6 +229,7 @@ Status LoadInitializerOrtFormat(const fbs::Tensor& fbs_tensor, TensorProto& init
   return Status::OK();
 }
 
+#if !defined(DISABLE_SPARSE_TENSORS)
 Status LoadSparseInitializerOrtFormat(const fbs::SparseTensor& fbs_sparse_tensor,
                                       SparseTensorProto& initializer) {
   SparseTensorProto loaded_initializer;
@@ -248,6 +253,7 @@ Status LoadSparseInitializerOrtFormat(const fbs::SparseTensor& fbs_sparse_tensor
   swap(loaded_initializer, initializer);
   return Status::OK();
 }
+#endif  // !defined(DISABLE_SPARSE_TENSORS)
 
 Status LoadAttributeOrtFormat(const fbs::Attribute& fbs_attr,
                               ONNX_NAMESPACE::AttributeProto& attr_proto,
diff --git a/onnxruntime/core/graph/graph_flatbuffers_utils.h b/onnxruntime/core/graph/graph_flatbuffers_utils.h
index eeb5353dd1..f1d7ab989c 100644
--- a/onnxruntime/core/graph/graph_flatbuffers_utils.h
+++ b/onnxruntime/core/graph/graph_flatbuffers_utils.h
@@ -3,10 +3,17 @@
 
 #pragma once
 
+#include <memory>
+
+#include "core/common/status.h"
+
 namespace ONNX_NAMESPACE {
-class TensorProto;
-class SparseTensorProto;
 class AttributeProto;
+class TensorProto;
+
+#if !defined(DISABLE_SPARSE_TENSORS)
+class SparseTensorProto;
+#endif  // !defined(DISABLE_SPARSE_TENSORS)
 }  // namespace ONNX_NAMESPACE
 
 namespace flatbuffers {
@@ -29,22 +36,27 @@ namespace fbs {
 struct Attribute;
 struct Tensor;
 
+#if !defined(DISABLE_SPARSE_TENSORS)
+struct SparseTensor;
+#endif  // !defined(DISABLE_SPARSE_TENSORS)
+
 namespace utils {
 
-// TODO, add ORT_MUST_USE_RESULT when it is moved to a different header
-onnxruntime::common::Status SaveInitializerOrtFormat(
+Status SaveInitializerOrtFormat(
     flatbuffers::FlatBufferBuilder& builder, const ONNX_NAMESPACE::TensorProto& initializer,
     const Path& model_path, flatbuffers::Offset<fbs::Tensor>& fbs_tensor);
 
-onnxruntime::common::Status SaveSparseInitializerOrtFormat(
+#if !defined(DISABLE_SPARSE_TENSORS)
+Status SaveSparseInitializerOrtFormat(
     flatbuffers::FlatBufferBuilder& builder, const ONNX_NAMESPACE::SparseTensorProto& initializer,
     const Path& model_path, flatbuffers::Offset<fbs::SparseTensor>& fbs_sparse_tensor);
+#endif  // !defined(DISABLE_SPARSE_TENSORS)
 
 // Convert a given AttributeProto into fbs::Attribute
 // Note, we current do not support graphs, and sparse_tensor(s)
 //       If the attribute type is a graph, we need to use the supplied Graph instance,
 //       instead of the GraphProto in attr_proto
-onnxruntime::common::Status SaveAttributeOrtFormat(
+Status SaveAttributeOrtFormat(
     flatbuffers::FlatBufferBuilder& builder, const ONNX_NAMESPACE::AttributeProto& attr_proto,
     flatbuffers::Offset<fbs::Attribute>& fbs_attr, const Path& model_path,
     const onnxruntime::Graph* subgraph);
@@ -59,22 +71,24 @@ onnxruntime::common::Status SaveAttributeOrtFormat(
 /// This requires the buffer to remain valid for the entire duration of the InferenceSession.
 /// </param>
 /// <returns>Status</returns>
-onnxruntime::common::Status LoadInitializerOrtFormat(const fbs::Tensor& fbs_tensor,
-                                                     ONNX_NAMESPACE::TensorProto& initializer,
-                                                     bool can_use_flatbuffer_for_initializers = false);
+Status LoadInitializerOrtFormat(const fbs::Tensor& fbs_tensor,
+                                ONNX_NAMESPACE::TensorProto& initializer,
+                                bool can_use_flatbuffer_for_initializers = false);
 
-onnxruntime::common::Status LoadSparseInitializerOrtFormat(const fbs::SparseTensor& fbs_sparse_tensor,
-                                                           ONNX_NAMESPACE::SparseTensorProto& initializer);
+#if !defined(DISABLE_SPARSE_TENSORS)
+Status LoadSparseInitializerOrtFormat(const fbs::SparseTensor& fbs_sparse_tensor,
+                                      ONNX_NAMESPACE::SparseTensorProto& initializer);
+#endif  // !defined(DISABLE_SPARSE_TENSORS)
 
 // Load a give fbs::Attribute into AttributeProto
 // Note, If the attribute type is a graph, we will leave an empty graph in attr_proto,
 //       and set the deserialized Graph to the param graph
-onnxruntime::common::Status LoadAttributeOrtFormat(const fbs::Attribute& fbs_attr,
-                                                   ONNX_NAMESPACE::AttributeProto& attr_proto,
-                                                   std::unique_ptr<onnxruntime::Graph>& sub_graph,
-                                                   onnxruntime::Graph& graph, onnxruntime::Node& node,
-                                                   bool can_use_flatbuffer_for_initializers,
-                                                   const logging::Logger& logger);
+Status LoadAttributeOrtFormat(const fbs::Attribute& fbs_attr,
+                              ONNX_NAMESPACE::AttributeProto& attr_proto,
+                              std::unique_ptr<onnxruntime::Graph>& sub_graph,
+                              onnxruntime::Graph& graph, onnxruntime::Node& node,
+                              bool can_use_flatbuffer_for_initializers,
+                              const logging::Logger& logger);
 
 }  // namespace utils
 }  // namespace fbs
diff --git a/onnxruntime/core/graph/graph_utils.h b/onnxruntime/core/graph/graph_utils.h
index 0074d1afc2..4d1555d6ac 100644
--- a/onnxruntime/core/graph/graph_utils.h
+++ b/onnxruntime/core/graph/graph_utils.h
@@ -12,6 +12,7 @@
 #include "onnx/onnx-operators_pb.h"
 
 #include "core/common/inlined_containers.h"
+#include "core/common/span_utils.h"
 #include "core/graph/graph.h"
 
 namespace onnxruntime {
@@ -282,7 +283,7 @@ inline void FinalizeNodeFusion(Graph& graph,
                                std::initializer_list<std::reference_wrapper<Node>> nodes,
                                Node& replacement_node_start,
                                Node& replacement_node_end) {
-  FinalizeNodeFusion(graph, gsl::make_span(nodes), replacement_node_start, replacement_node_end);
+  FinalizeNodeFusion(graph, AsSpan(nodes), replacement_node_start, replacement_node_end);
 }
 
 /** Finalize the fusion of two or more nodes which are being replaced with a single node.
@@ -300,7 +301,7 @@ inline void FinalizeNodeFusion(Graph& graph, gsl::span<const std::reference_wrap
 }
 
 inline void FinalizeNodeFusion(Graph& graph, std::initializer_list<std::reference_wrapper<Node>> nodes, Node& replacement_node) {
-  FinalizeNodeFusion(graph, gsl::make_span(nodes.begin(), nodes.end()), replacement_node, replacement_node);
+  FinalizeNodeFusion(graph, AsSpan(nodes), replacement_node, replacement_node);
 }
 
 /** Find the source node of an input edge for a specified input index.
@@ -353,7 +354,7 @@ struct EdgeEndToMatch {
 bool FindPath(const Node& node, bool is_input_edge, gsl::span<const EdgeEndToMatch> edges_to_match, std::vector<const Node::EdgeEnd*>& result, const logging::Logger& logger);
 
 inline bool FindPath(const Node& node, bool is_input_edge, std::initializer_list<EdgeEndToMatch> edges_to_match, std::vector<const Node::EdgeEnd*>& result, const logging::Logger& logger) {
-  return FindPath(node, is_input_edge, gsl::make_span(edges_to_match), result, logger);
+  return FindPath(node, is_input_edge, AsSpan(edges_to_match), result, logger);
 }
 
 /** Same as FindPath above, but return the references of matched Node
@@ -361,7 +362,7 @@ inline bool FindPath(const Node& node, bool is_input_edge, std::initializer_list
 bool FindPath(Graph& graph, const Node& node, bool is_input_edge, gsl::span<const EdgeEndToMatch> edges_to_match, std::vector<std::reference_wrapper<Node>>& result, const logging::Logger& logger);
 
 inline bool FindPath(Graph& graph, const Node& node, bool is_input_edge, std::initializer_list<EdgeEndToMatch> edges_to_match, std::vector<std::reference_wrapper<Node>>& result, const logging::Logger& logger) {
-  return FindPath(graph, node, is_input_edge, gsl::make_span(edges_to_match), result, logger);
+  return FindPath(graph, node, is_input_edge, AsSpan(edges_to_match), result, logger);
 }
 
 /**
diff --git a/onnxruntime/core/graph/model.cc b/onnxruntime/core/graph/model.cc
index 7b046dcfcb..4d7bce7f6d 100644
--- a/onnxruntime/core/graph/model.cc
+++ b/onnxruntime/core/graph/model.cc
@@ -20,7 +20,7 @@
 #endif
 #include "core/util/protobuf_parsing_utils.h"
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 #include "core/platform/env.h"
 
diff --git a/onnxruntime/core/graph/model.h b/onnxruntime/core/graph/model.h
index cf9a5c66cc..49280ca9fa 100644
--- a/onnxruntime/core/graph/model.h
+++ b/onnxruntime/core/graph/model.h
@@ -13,7 +13,6 @@
 #if !defined(ORT_MINIMAL_BUILD)
 #include "core/graph/function_template.h"
 #endif
-#include "gsl/gsl"
 
 namespace flatbuffers {
 class FlatBufferBuilder;
diff --git a/onnxruntime/core/graph/model_load_utils.h b/onnxruntime/core/graph/model_load_utils.h
index f5aa45d191..ab1096ac29 100644
--- a/onnxruntime/core/graph/model_load_utils.h
+++ b/onnxruntime/core/graph/model_load_utils.h
@@ -6,7 +6,6 @@
 #include <unordered_map>
 #include <memory>
 #include <string>
-#include "gsl/gsl"
 #include "core/platform/env.h"
 #include "core/common/common.h"
 
diff --git a/onnxruntime/core/graph/node_attr_utils.h b/onnxruntime/core/graph/node_attr_utils.h
index 94242e8d26..461ec01d67 100644
--- a/onnxruntime/core/graph/node_attr_utils.h
+++ b/onnxruntime/core/graph/node_attr_utils.h
@@ -5,7 +5,7 @@
 
 #include <string>
 
-#include <gsl/gsl>
+#include "core/common/gsl.h"
 
 #include "onnx/onnx_pb.h"
 
diff --git a/onnxruntime/core/graph/runtime_optimization_record_container.cc b/onnxruntime/core/graph/runtime_optimization_record_container.cc
index 9a4e705d9a..acd85b909e 100644
--- a/onnxruntime/core/graph/runtime_optimization_record_container.cc
+++ b/onnxruntime/core/graph/runtime_optimization_record_container.cc
@@ -7,7 +7,7 @@
 
 #include <algorithm>
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 #include "core/flatbuffers/flatbuffers_utils.h"
 #include "core/flatbuffers/schema/ort.fbs.h"
diff --git a/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc b/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc
index 41c25e21b6..7b6f829b7a 100644
--- a/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc
+++ b/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc
@@ -1,7 +1,9 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
-#include "core/optimizer/initializer.h"
 #include "core/optimizer/embed_layer_norm_fusion.h"
+
+#include "core/common/span_utils.h"
+#include "core/optimizer/initializer.h"
 #include "core/graph/contrib_ops/contrib_defs.h"
 #include "core/graph/graph_utils.h"
 #include "core/optimizer/utils.h"
@@ -73,7 +75,7 @@ static bool IsNeighborNodeExpectedTypes(Node::NodeConstIterator start, const Nod
 }
 
 static inline bool IsNeighborNodeExpectedTypes(Node::NodeConstIterator start, const Node::NodeConstIterator end, std::initializer_list<std::string> expected_types) {
-  return IsNeighborNodeExpectedTypes(start, end, gsl::make_span(expected_types));
+  return IsNeighborNodeExpectedTypes(start, end, AsSpan(expected_types));
 }
 
 /** Match subgraph like the following:
diff --git a/onnxruntime/core/optimizer/free_dim_override_transformer.h b/onnxruntime/core/optimizer/free_dim_override_transformer.h
index 18e0b128b8..f9553339a7 100644
--- a/onnxruntime/core/optimizer/free_dim_override_transformer.h
+++ b/onnxruntime/core/optimizer/free_dim_override_transformer.h
@@ -3,7 +3,7 @@
 
 #pragma once
 
-#include <gsl/gsl>
+#include "core/common/gsl.h"
 
 #include "core/common/common.h"
 #include "core/optimizer/graph_transformer.h"
diff --git a/onnxruntime/core/optimizer/initializer.cc b/onnxruntime/core/optimizer/initializer.cc
index 50092d2fb8..8904a2995c 100644
--- a/onnxruntime/core/optimizer/initializer.cc
+++ b/onnxruntime/core/optimizer/initializer.cc
@@ -3,7 +3,7 @@
 
 #include "core/optimizer/initializer.h"
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 #include "core/common/path.h"
 #include "core/framework/tensorprotoutils.h"
diff --git a/onnxruntime/core/optimizer/propagate_cast_ops.cc b/onnxruntime/core/optimizer/propagate_cast_ops.cc
index 61d4388aa9..64ec520740 100644
--- a/onnxruntime/core/optimizer/propagate_cast_ops.cc
+++ b/onnxruntime/core/optimizer/propagate_cast_ops.cc
@@ -1,7 +1,9 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
-#include "core/optimizer/initializer.h"
 #include "core/optimizer/propagate_cast_ops.h"
+
+#include "core/common/span_utils.h"
+#include "core/optimizer/initializer.h"
 #include "core/graph/graph_utils.h"
 #include "core/optimizer/utils.h"
 #include <deque>
@@ -458,12 +460,12 @@ static bool RemoveBackToBackCasts(Graph& graph, Node* parent,
         if (IsCastTo(child, TensorProto::FLOAT16)) {
           // The parent and child cancel out
           LOGS(logger, VERBOSE) << "RemoveBackToBackCasts: Removed Cast nodes  " << parent->Name() << " and " << child->Name();
-          ORT_THROW_IF_ERROR(RemoveCastNodesChain(graph, std::array{parent, child}, removed_nodes));
+          ORT_THROW_IF_ERROR(RemoveCastNodesChain(graph, AsSpan({parent, child}), removed_nodes));
           modified = true;
         } else if (IsCastTo(child, TensorProto::FLOAT)) {
           // Child is a duplicate of parent
           LOGS(logger, VERBOSE) << "RemoveBackToBackCasts: Removed Cast node  " << child->Name();
-          ORT_THROW_IF_ERROR(RemoveCastNodesChain(graph, {child}, removed_nodes));
+          ORT_THROW_IF_ERROR(RemoveCastNodesChain(graph, AsSpan({child}), removed_nodes));
           modified = true;
         }
       }
@@ -510,14 +512,14 @@ static bool RemoveBackToBackCasts(Graph& graph, Node* parent,
             // Child is a duplicate of parent
             LOGS(logger, VERBOSE) << "RemoveBackToBackCasts: Removed Cast node  " << child->Name();
             graph.RemoveEdge(parent->Index(), child->Index(), 0, 0);
-            ORT_THROW_IF_ERROR(RemoveCastNodesChain(graph, {child}, removed_nodes));
+            ORT_THROW_IF_ERROR(RemoveCastNodesChain(graph, AsSpan({child}), removed_nodes));
             modified = true;
           }
         }
       }
       if (children_count == 0) {
         // No more children nodes exists, and the parent-cast output is not a graph output. Remove it!
-        ORT_THROW_IF_ERROR(RemoveCastNodesChain(graph, {parent}, removed_nodes));
+        ORT_THROW_IF_ERROR(RemoveCastNodesChain(graph, AsSpan({parent}), removed_nodes));
       }
       if (!new_consumers.empty()) {
         auto consumers = graph.GetMutableConsumerNodes(parent_input->Name());
@@ -735,7 +737,7 @@ static bool PropagateForwards(Graph& graph, Node* node,
     }
     // Remove Cast operation
     LOGS(logger, VERBOSE) << "PropagateForwards: Removed Cast node  " << node->Name();
-    ORT_THROW_IF_ERROR(RemoveCastNodesChain(graph, {node}, removed_nodes));
+    ORT_THROW_IF_ERROR(RemoveCastNodesChain(graph, AsSpan({node}), removed_nodes));
     ORT_THROW_IF_ERROR(InsertCastNodes(graph, require_cast, false, removed_nodes, inserted_nodes));
     LOGS(logger, VERBOSE) << "PropagateForwards: Inserted Cast FP32 nodes "
                           << ConcatNames(require_cast, GetName);
@@ -781,7 +783,7 @@ static bool PropagateBackwards(Graph& graph, Node* node,
                             << ConcatNames(require_cast_fp32, GetName);
     }
     // Remove Cast operations
-    ORT_THROW_IF_ERROR(RemoveCastNodesChain(graph, {node}, removed_nodes));
+    ORT_THROW_IF_ERROR(RemoveCastNodesChain(graph, AsSpan({node}), removed_nodes));
     LOGS(logger, VERBOSE) << "PropagateBackwards: Removed Cast node  " << node->Name();
     ORT_THROW_IF_ERROR(InsertCastNodes(graph, require_cast, true, removed_nodes, inserted_nodes));
     LOGS(logger, VERBOSE) << "PropagateBackwards: Inserted Cast nodes "
@@ -920,7 +922,7 @@ static bool RemoveUnnecessaryCasts(Graph& graph, Node* node,
     TensorProto_DataType data_type = static_cast<TensorProto_DataType>(elem_type);
     if (IsCastTo(node, data_type)) {
       LOGS(logger, VERBOSE) << "Removed unnecessary cast " << node->Name();
-      ORT_THROW_IF_ERROR(RemoveCastNodesChain(graph, {node}, removed_nodes));
+      ORT_THROW_IF_ERROR(RemoveCastNodesChain(graph, AsSpan({node}), removed_nodes));
       modified = true;
     }
   }
@@ -1015,7 +1017,7 @@ static bool PropagateFP32CastsFromInputsToOutputs(Graph& graph, Node* node,
                               << ConcatNames(non_cast_producers_map, GetName);
       }
       for (Node* cast : casts) {
-        ORT_THROW_IF_ERROR(RemoveCastNodesChain(graph, {cast}, removed_nodes));
+        ORT_THROW_IF_ERROR(RemoveCastNodesChain(graph, AsSpan({cast}), removed_nodes));
       }
       LOGS(logger, VERBOSE) << "PropagateFP32CastsFromInputsToOutputs: Removed Cast nodes "
                             << ConcatNames(casts)
@@ -1122,7 +1124,7 @@ static bool PropagateFP16CastsFromOutputsToInputs(Graph& graph, Node* node,
                               << ConcatNames(non_cast_consumers_map, GetName);
       }
       for (Node* cast : casts) {
-        ORT_THROW_IF_ERROR(RemoveCastNodesChain(graph, {cast}, removed_nodes));
+        ORT_THROW_IF_ERROR(RemoveCastNodesChain(graph, AsSpan({cast}), removed_nodes));
       }
       LOGS(logger, VERBOSE) << "PropagateFP16CastsFromOutputsToInputs: Removed Cast nodes "
                             << ConcatNames(casts)
diff --git a/onnxruntime/core/optimizer/qdq_transformer/qdq_util.cc b/onnxruntime/core/optimizer/qdq_transformer/qdq_util.cc
index 1898b6b290..7d5c6c209e 100644
--- a/onnxruntime/core/optimizer/qdq_transformer/qdq_util.cc
+++ b/onnxruntime/core/optimizer/qdq_transformer/qdq_util.cc
@@ -6,6 +6,7 @@
 #include <vector>
 
 #include "core/common/common.h"
+#include "core/common/span_utils.h"
 #include "core/graph/graph.h"
 #include "core/graph/graph_utils.h"
 #include "core/optimizer/initializer.h"
@@ -54,7 +55,7 @@ bool IsQDQPairSupported(
   Initializer dq_scale(*dq_scale_tensor_proto, model_path);
 
   return q_zp.data_type() == dq_zp.data_type() &&
-         q_zp.DataAsByteSpan() == dq_zp.DataAsByteSpan() &&
+         SpanEq(q_zp.DataAsByteSpan(), dq_zp.DataAsByteSpan()) &&
          *q_scale.data<float>() == *dq_scale.data<float>();
 }
 
diff --git a/onnxruntime/core/optimizer/selectors_actions/actions.cc b/onnxruntime/core/optimizer/selectors_actions/actions.cc
index 2229a280a9..4fdfdf34d7 100644
--- a/onnxruntime/core/optimizer/selectors_actions/actions.cc
+++ b/onnxruntime/core/optimizer/selectors_actions/actions.cc
@@ -34,7 +34,7 @@ bool CanSafelyRemoveNode(const Node& node_to_remove, const InlinedHashSet<const
 void SafelyRemoveNodes(Graph& graph, gsl::span<Node* const> nodes_to_remove, const Node* ignore_target) {
   InlinedHashSet<const Node*> removal_set;
   removal_set.reserve(nodes_to_remove.size());
-  removal_set.insert(nodes_to_remove.cbegin(), nodes_to_remove.cend());
+  removal_set.insert(nodes_to_remove.begin(), nodes_to_remove.end());
 
   for (Node* node : nodes_to_remove) {
     if (node && node != ignore_target && CanSafelyRemoveNode(*node, removal_set)) {
diff --git a/onnxruntime/core/optimizer/selectors_actions/actions.h b/onnxruntime/core/optimizer/selectors_actions/actions.h
index 4bd4f6cadf..52ee2336b3 100644
--- a/onnxruntime/core/optimizer/selectors_actions/actions.h
+++ b/onnxruntime/core/optimizer/selectors_actions/actions.h
@@ -5,9 +5,8 @@
 
 #include <vector>
 
-#include "gsl/gsl"
-
 #include "core/common/common.h"
+#include "core/common/gsl.h"
 #include "core/graph/graph_utils.h"  // TODO: Minimize usage of this given we want to use Actions in a minimal build
 #include "core/graph/runtime_optimization_record.h"
 #include "core/optimizer/selectors_actions/helpers.h"
diff --git a/onnxruntime/core/optimizer/selectors_actions/helpers.cc b/onnxruntime/core/optimizer/selectors_actions/helpers.cc
index 9eb03badb8..0cfb0aeda0 100644
--- a/onnxruntime/core/optimizer/selectors_actions/helpers.cc
+++ b/onnxruntime/core/optimizer/selectors_actions/helpers.cc
@@ -3,6 +3,8 @@
 
 #include "core/optimizer/selectors_actions/helpers.h"
 
+#include "core/common/narrow.h"
+#include "core/common/span_utils.h"
 #include "core/optimizer/selectors_actions/actions.h"
 
 using namespace ONNX_NAMESPACE;
@@ -107,7 +109,7 @@ Status MoveInputOutputImpl(Graph& graph, const ValueMoveInfo& move_info, Node& s
   };
 
   if (move_info.copy_all) {
-    for (int i = 0, end = gsl::narrow<int>(src_defs.size()); i < end; ++i) {
+    for (int i = 0, end = narrow<int>(src_defs.size()); i < end; ++i) {
       ORT_RETURN_IF_ERROR(process(i));
     }
   } else {
@@ -132,7 +134,7 @@ bool GetNodesByNodeIndex(Graph& graph, gsl::span<const NodeIndex> indices, Inlin
   nodes.reserve(indices.size());
   bool missing = false;
 
-  for (auto iter = indices.cbegin(), end = indices.cend(); iter != end; ++iter) {
+  for (auto iter = indices.begin(), end = indices.end(); iter != end; ++iter) {
     nodes.push_back(GetNodeByNodeIndex(graph, *iter, missing));
 
     // bail if we're missing a node
@@ -261,7 +263,7 @@ InlinedVector<Node*> NodesToOptimize::Inputs(gsl::span<const int> indices, bool
   return results;
 }
 
-InlinedVector<Node*> NodesToOptimize::Outputs(const std::vector<int>& indices, bool required) const {
+InlinedVector<Node*> NodesToOptimize::Outputs(gsl::span<const int> indices, bool required) const {
   InlinedVector<Node*> results;
   results.reserve(NumOutputEntries());
 
@@ -283,9 +285,9 @@ InlinedVector<Node*> NodesToOptimize::Outputs(const std::vector<int>& indices, b
 
 InlinedVector<Node*> NodesToOptimize::GetNodesAtLocation(const NodeLocation& location, bool required) const {
   if (location.type == NodeType::kInput) {
-    return Inputs({location.index}, required);
+    return Inputs(AsSpan({location.index}), required);
   } else if (location.type == NodeType::kOutput) {
-    return Outputs({location.index}, required);
+    return Outputs(AsSpan({location.index}), required);
   } else {
     return {&Target()};
   }
@@ -309,7 +311,7 @@ Status MoveInputOutput(Graph& graph, Node& src, Node& dest, const ValueMoveInfo&
 }
 
 Status MoveInputOutput(Graph& graph, const NodesToOptimize& selected_nodes, Node& dest,
-                       const std::vector<NodeAndMoveInfo>& moves, bool only_update_dest_definitions) {
+                       gsl::span<const NodeAndMoveInfo> moves, bool only_update_dest_definitions) {
   for (const auto& move : moves) {
     auto src_nodes = selected_nodes.GetNodesAtLocation(move.src_node, !move.value_move_info.optional);
 
diff --git a/onnxruntime/core/optimizer/selectors_actions/helpers.h b/onnxruntime/core/optimizer/selectors_actions/helpers.h
index b72996e129..cf5489dc19 100644
--- a/onnxruntime/core/optimizer/selectors_actions/helpers.h
+++ b/onnxruntime/core/optimizer/selectors_actions/helpers.h
@@ -3,9 +3,8 @@
 
 #pragma once
 
-#include "gsl/gsl"
-
 #include "core/common/basic_types.h"
+#include "core/common/gsl.h"
 #include "core/common/inlined_containers.h"
 #include "core/graph/graph.h"
 #include "core/graph/runtime_optimization_record.h"
@@ -96,7 +95,7 @@ class NodesToOptimize {
   }
 
   // outputs filtered by index. includes all variadic.
-  InlinedVector<Node*> Outputs(const std::vector<int>& indices, bool required = true) const;
+  InlinedVector<Node*> Outputs(gsl::span<const int> indices, bool required = true) const;
 
   // Get the Node or Nodes (if variadic) at a specific index.
   InlinedVector<Node*> GetNodesAtLocation(const NodeLocation& location, bool required = true) const;
@@ -198,7 +197,7 @@ struct NodeAndMoveInfo {
 // rest of the graph. e.g., when creating a temporary node that is used to look up a kernel def, we can set the
 // temporary node's definitions (which is all we need) without updating existing graph edges.
 Status MoveInputOutput(Graph& graph, const NodesToOptimize& selected_nodes, Node& dest,
-                       const std::vector<NodeAndMoveInfo>& moves, bool only_update_dest_definitions);
+                       gsl::span<const NodeAndMoveInfo> moves, bool only_update_dest_definitions);
 
 Status MoveInputOutput(Graph& graph, Node& src, Node& dest, const ValueMoveInfo& move_info,
                        bool only_update_dest_definitions);
diff --git a/onnxruntime/core/optimizer/transpose_optimizer/optimizer_api_impl.cc b/onnxruntime/core/optimizer/transpose_optimizer/optimizer_api_impl.cc
index 31f6f89cb0..b4bc905801 100644
--- a/onnxruntime/core/optimizer/transpose_optimizer/optimizer_api_impl.cc
+++ b/onnxruntime/core/optimizer/transpose_optimizer/optimizer_api_impl.cc
@@ -166,7 +166,7 @@ std::optional<std::vector<int64_t>> ApiValueInfo::Shape() const {
   const auto dims = shape.GetDims();
   std::vector<int64_t> result;
   result.reserve(dims.size());
-  result.assign(dims.cbegin(), dims.cend());
+  result.assign(dims.begin(), dims.end());
   return result;
 }
 
@@ -253,7 +253,7 @@ void ApiValueInfo::UnsqueezeDims(const std::vector<int64_t>& axes) {
 std::vector<int64_t> ApiTensor::Shape() const {
   TensorShape shape = utils::GetTensorShapeFromTensorProto(tensor_proto_);
   const auto dims = shape.GetDims();
-  return std::vector<int64_t>{dims.cbegin(), dims.cend()};
+  return std::vector<int64_t>{dims.begin(), dims.end()};
 }
 
 size_t ApiTensor::NumElements() const {
diff --git a/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc b/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc
index 794e459432..e8b71e14fb 100644
--- a/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc
+++ b/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc
@@ -4,7 +4,7 @@
 #include "optimizer_api.h"
 
 #include <algorithm>
-#include <gsl/gsl>
+#include "core/common/gsl.h"
 #include <iostream>
 #include <unordered_map>
 #include <unordered_set>
diff --git a/onnxruntime/core/platform/env.cc b/onnxruntime/core/platform/env.cc
index 854c9d5214..5ebda75b7c 100644
--- a/onnxruntime/core/platform/env.cc
+++ b/onnxruntime/core/platform/env.cc
@@ -15,7 +15,6 @@ limitations under the License.
 // Portions Copyright (c) Microsoft Corporation
 
 #include "core/platform/env.h"
-#include "gsl/gsl"
 
 namespace onnxruntime {
 
@@ -36,15 +35,3 @@ std::ostream& operator<<(std::ostream& os, gsl::span<const LogicalProcessors> af
 Env::Env() = default;
 
 }  // namespace onnxruntime
-
-// This definition is provided to handle GSL failures in CUDA as
-// not throwing exception but calling a user-defined handler.
-// Otherwise gsl condition checks code does not compile even though
-// gsl may not be used in CUDA specific code.
-namespace gsl {
-gsl_api void fail_fast_assert_handler(
-    char const* const expression, char const* const message,
-    char const* const file, int line) {
-  ORT_ENFORCE(false, expression, file, line, message);
-}
-} // namespace gsl
diff --git a/onnxruntime/core/platform/env.h b/onnxruntime/core/platform/env.h
index c2fb51a575..208f66143d 100644
--- a/onnxruntime/core/platform/env.h
+++ b/onnxruntime/core/platform/env.h
@@ -22,7 +22,7 @@ limitations under the License.
 #include <string>
 #include <unordered_map>
 #include <vector>
-#include <gsl/gsl>
+#include "core/common/gsl.h"
 
 #include "core/common/common.h"
 #include "core/common/path_string.h"
@@ -226,7 +226,7 @@ class Env {
   // library are platform-specific and are not documented here.
   //
   // global_symbols only has an effect on unix, where a value of true means to load with RTLD_GLOBAL vs RTLD_LOCAL
-  // 
+  //
   // On success, returns a handle to the library in "*handle" and returns
   // OK from the function.
   // Otherwise returns nullptr in "*handle" and an error status from the
diff --git a/onnxruntime/core/platform/path_lib.cc b/onnxruntime/core/platform/path_lib.cc
index aea20d1703..3d6c71977c 100644
--- a/onnxruntime/core/platform/path_lib.cc
+++ b/onnxruntime/core/platform/path_lib.cc
@@ -7,8 +7,6 @@
 #include <array>
 #include <algorithm>
 
-#include "gsl/gsl"
-
 #include "core/common/status.h"
 #include "core/common/common.h"
 #ifdef _WIN32
diff --git a/onnxruntime/core/platform/posix/env.cc b/onnxruntime/core/platform/posix/env.cc
index 1e55ed1dce..3d1c28a26c 100644
--- a/onnxruntime/core/platform/posix/env.cc
+++ b/onnxruntime/core/platform/posix/env.cc
@@ -36,14 +36,14 @@ limitations under the License.
 #include <utility>  // for std::forward
 #include <vector>
 
-#include <gsl/gsl>
-
 #ifdef CPUINFO_SUPPORTED
 #include <cpuinfo.h>
 #endif
 
 #include "core/common/common.h"
+#include "core/common/gsl.h"
 #include "core/common/logging/logging.h"
+#include "core/common/narrow.h"
 #include "core/platform/scoped_resource.h"
 #include "core/platform/EigenNonBlockingThreadPool.h"
 
@@ -171,7 +171,7 @@ class PosixThread : public EnvThread {
     custom_join_thread_fn = thread_options.custom_join_thread_fn;
 
     auto param_ptr = std::make_unique<Param>(name_prefix, index, start_address, param);
-    if (gsl::narrow<size_t>(index) < thread_options.affinity.size()) {
+    if (narrow<size_t>(index) < thread_options.affinity.size()) {
       param_ptr->affinity = thread_options.affinity[index];
     }
 
@@ -280,7 +280,7 @@ class PosixEnv : public Env {
   int GetNumPhysicalCpuCores() const override {
 #ifdef CPUINFO_SUPPORTED
     if(cpuinfo_available_) {
-      return gsl::narrow<int>(cpuinfo_get_cores_count());
+      return narrow<int>(cpuinfo_get_cores_count());
     }
 #endif
     // We guess the number of cores
diff --git a/onnxruntime/core/platform/windows/env.cc b/onnxruntime/core/platform/windows/env.cc
index cac5157ad1..a96b1d1608 100644
--- a/onnxruntime/core/platform/windows/env.cc
+++ b/onnxruntime/core/platform/windows/env.cc
@@ -27,8 +27,9 @@ limitations under the License.
 #include <fcntl.h>
 #include <io.h>
 
-#include <gsl/gsl>
+#include "core/common/gsl.h"
 #include "core/common/logging/logging.h"
+#include "core/common/narrow.h"
 #include "core/common/span_utils.h"
 #include "core/platform/env.h"
 #include "core/platform/scoped_resource.h"
@@ -93,7 +94,7 @@ class WindowsThread : public EnvThread {
     custom_join_thread_fn = thread_options.custom_join_thread_fn;
 
     std::unique_ptr<Param> local_param = std::make_unique<Param>(name_prefix, index, start_address, param);
-    if (gsl::narrow<size_t>(index) < thread_options.affinity.size()) {
+    if (narrow<size_t>(index) < thread_options.affinity.size()) {
       local_param->affinity = thread_options.affinity[index];
     }
 
@@ -248,7 +249,7 @@ class WindowsEnv : public Env {
       return {};
     }
 
-    const size_t count = gsl::narrow<size_t>(returnLength) / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
+    const size_t count = narrow<size_t>(returnLength) / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
     std::optional<LogicalProcessorInformation> result;
     result = {std::move(allocation), gsl::make_span(buffer, count)};
     return result;
diff --git a/onnxruntime/core/platform/windows/stacktrace.cc b/onnxruntime/core/platform/windows/stacktrace.cc
index 4a802f0cc9..954c3c2a5d 100644
--- a/onnxruntime/core/platform/windows/stacktrace.cc
+++ b/onnxruntime/core/platform/windows/stacktrace.cc
@@ -10,7 +10,7 @@
 #include <DbgHelp.h>
 
 #include "core/common/logging/logging.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 namespace onnxruntime {
 
diff --git a/onnxruntime/core/providers/cpu/controlflow/if.h b/onnxruntime/core/providers/cpu/controlflow/if.h
index ffbbc3185c..67bcdc569f 100644
--- a/onnxruntime/core/providers/cpu/controlflow/if.h
+++ b/onnxruntime/core/providers/cpu/controlflow/if.h
@@ -3,7 +3,6 @@
 
 #pragma once
 #include <functional>
-#include "gsl/gsl"
 
 #include "core/providers/cpu/controlflow/utils.h"
 
diff --git a/onnxruntime/core/providers/cpu/controlflow/loop.cc b/onnxruntime/core/providers/cpu/controlflow/loop.cc
index fac00d83b0..336ecd3844 100644
--- a/onnxruntime/core/providers/cpu/controlflow/loop.cc
+++ b/onnxruntime/core/providers/cpu/controlflow/loop.cc
@@ -23,7 +23,7 @@
 #include "core/framework/TensorSeq.h"
 #include "core/providers/utils.h"
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 #ifdef _MSC_VER
 #pragma warning(pop)
@@ -470,7 +470,7 @@ Status LoopImpl::ConcatenateLoopOutput(std::vector<OrtValue>& per_iteration_outp
 
   // first dimension is number of iterations
   dims.push_back(gsl::narrow_cast<int64_t>(per_iteration_output.size()));
-  std::copy(per_iteration_dims.cbegin(), per_iteration_dims.cend(), std::back_inserter(dims));
+  std::copy(per_iteration_dims.begin(), per_iteration_dims.end(), std::back_inserter(dims));
 
   TensorShape output_shape{dims};
   Tensor* output = context_.Output(output_index, output_shape);
@@ -604,7 +604,7 @@ Status LoopImpl::Execute(const FeedsFetchesManager& ffm) {
         const auto& dims = tensor_shape.GetDims();
 
         // copy to output dims and use 0 for any symbolic dim
-        std::for_each(dims.cbegin(), dims.cend(),
+        std::for_each(dims.begin(), dims.end(),
                       [&output_dims](const int64_t dim) { output_dims.push_back(dim < 0 ? 0 : dim); });
       } else {
         // TODO: We could try and call ExecuteGraph to get the output shape from fetches so the rank is correct,
diff --git a/onnxruntime/core/providers/cpu/controlflow/loop.h b/onnxruntime/core/providers/cpu/controlflow/loop.h
index 5cc1d39cf8..e34863f65c 100644
--- a/onnxruntime/core/providers/cpu/controlflow/loop.h
+++ b/onnxruntime/core/providers/cpu/controlflow/loop.h
@@ -3,7 +3,6 @@
 
 #pragma once
 #include <functional>
-#include "gsl/gsl"
 
 #include "core/common/common.h"
 #include "core/framework/feeds_fetches_manager.h"
diff --git a/onnxruntime/core/providers/cpu/controlflow/scan.h b/onnxruntime/core/providers/cpu/controlflow/scan.h
index c45b251045..14b39c25b8 100644
--- a/onnxruntime/core/providers/cpu/controlflow/scan.h
+++ b/onnxruntime/core/providers/cpu/controlflow/scan.h
@@ -3,7 +3,7 @@
 
 #pragma once
 #include <functional>
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 #ifndef SHARED_PROVIDER
 #include "core/common/common.h"
diff --git a/onnxruntime/core/providers/cpu/controlflow/scan_8.cc b/onnxruntime/core/providers/cpu/controlflow/scan_8.cc
index e0d180c8cc..d6294d28ad 100644
--- a/onnxruntime/core/providers/cpu/controlflow/scan_8.cc
+++ b/onnxruntime/core/providers/cpu/controlflow/scan_8.cc
@@ -293,7 +293,7 @@ Status Scan8Impl::ValidateInput() {
     }
 
     auto d = sequence_lens_tensor_->DataAsSpan<int64_t>();
-    sequence_lens_.assign(d.cbegin(), d.cend());
+    sequence_lens_.assign(d.begin(), d.end());
 
     if (!std::all_of(sequence_lens_.cbegin(), sequence_lens_.cend(),
                      [this](int64_t value) { return value > 0 && value <= max_sequence_len_; })) {
diff --git a/onnxruntime/core/providers/cpu/controlflow/scan_9.cc b/onnxruntime/core/providers/cpu/controlflow/scan_9.cc
index 7ff4321058..163a68fc24 100644
--- a/onnxruntime/core/providers/cpu/controlflow/scan_9.cc
+++ b/onnxruntime/core/providers/cpu/controlflow/scan_9.cc
@@ -21,7 +21,7 @@
 #include "core/providers/cpu/tensor/utils.h"
 #include "core/providers/cpu/tensor/transpose.h"
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 #ifdef _MSC_VER
 #pragma warning(pop)
diff --git a/onnxruntime/core/providers/cpu/controlflow/scan_utils.cc b/onnxruntime/core/providers/cpu/controlflow/scan_utils.cc
index 515cecc01a..72291825fc 100644
--- a/onnxruntime/core/providers/cpu/controlflow/scan_utils.cc
+++ b/onnxruntime/core/providers/cpu/controlflow/scan_utils.cc
@@ -11,8 +11,6 @@
 
 #include "core/providers/cpu/controlflow/scan_utils.h"
 
-#include "gsl/gsl"
-
 #include "core/framework/mldata_type_utils.h"
 #include "core/framework/op_kernel_context_internal.h"
 #include "core/framework/sequential_executor.h"
@@ -111,7 +109,7 @@ Status AllocateOutput(OpKernelContextInternal& context, const GraphViewer& subgr
     scan_output_dims.push_back(sequence_len);
   }
 
-  std::copy(graph_output_dims.cbegin(), graph_output_dims.cend(), std::back_inserter(scan_output_dims));
+  std::copy(graph_output_dims.begin(), graph_output_dims.end(), std::back_inserter(scan_output_dims));
 
   if (!temporary) {
     ORT_RETURN_IF_ERROR(OutputIterator::Create(context, output_index, is_loop_state_var, is_v8,
diff --git a/onnxruntime/core/providers/cpu/generator/constant_of_shape_base.h b/onnxruntime/core/providers/cpu/generator/constant_of_shape_base.h
index 25959a6a48..d96ff06e3d 100644
--- a/onnxruntime/core/providers/cpu/generator/constant_of_shape_base.h
+++ b/onnxruntime/core/providers/cpu/generator/constant_of_shape_base.h
@@ -57,7 +57,7 @@ class ConstantOfShapeBase {
 
     const auto span = shape_tensor->DataAsSpan<int64_t>();
 
-    TensorShape output_shape(span.begin(), span.size());
+    TensorShape output_shape(span);
     (*output_tensor) = ctx->Output(0, output_shape);
 
     return Status::OK();
diff --git a/onnxruntime/core/providers/cpu/generator/random.cc b/onnxruntime/core/providers/cpu/generator/random.cc
index 7d75fb31fe..3add38239e 100644
--- a/onnxruntime/core/providers/cpu/generator/random.cc
+++ b/onnxruntime/core/providers/cpu/generator/random.cc
@@ -25,8 +25,6 @@ limitations under the License.
 #include <chrono>
 #include <random>
 
-#include "gsl/gsl"
-
 #include "core/common/eigen_common_wrapper.h"
 #include "core/common/safeint.h"
 #include "core/framework/op_kernel_type_control_utils.h"
diff --git a/onnxruntime/core/providers/cpu/generator/random.h b/onnxruntime/core/providers/cpu/generator/random.h
index 3bf482d36e..7f5a761853 100644
--- a/onnxruntime/core/providers/cpu/generator/random.h
+++ b/onnxruntime/core/providers/cpu/generator/random.h
@@ -4,7 +4,7 @@
 #pragma once
 
 #include <random>
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 #include "core/common/common.h"
 #include "core/framework/op_kernel.h"
diff --git a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_typed_compute_processor.cc b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_typed_compute_processor.cc
index 4daf008b74..83b280993e 100644
--- a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_typed_compute_processor.cc
+++ b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_typed_compute_processor.cc
@@ -3,6 +3,8 @@
 
 #include "einsum_typed_compute_processor.h"
 
+#include "core/common/span_utils.h"
+
 namespace onnxruntime {
 
 template <typename T>
@@ -83,7 +85,7 @@ static bool IsTransposeReshapeForEinsum(const gsl::span<const size_t>& perm,
       return false;
     last_permuted_axis = perm[i];
   }
-  new_shape.assign(input_dims.cbegin(), input_dims.cend());
+  new_shape.assign(input_dims.begin(), input_dims.end());
   for (size_t i = 0; i < perm.size(); ++i) {
     new_shape[i] = input_dims[perm[i]];
   }
@@ -162,13 +164,13 @@ std::unique_ptr<Tensor> EinsumTypedComputeProcessor<T>::PairwiseOperandProcess(c
         auto tensor_to_be_reduced_dims = current_left ? current_left->Shape().GetDims() : left_dims;
 
         current_left = EinsumOp::ReduceSum<T>(
-            tensor_to_be_reduced, tensor_to_be_reduced_dims, {i}, allocator_, tp_, einsum_ep_assets_, device_reduce_sum_func_);
+            tensor_to_be_reduced, tensor_to_be_reduced_dims, AsSpan({i}), allocator_, tp_, einsum_ep_assets_, device_reduce_sum_func_);
       } else if (has_right_dim) {
         const Tensor& tensor_to_be_reduced = current_right ? *current_right : right;
         auto tensor_to_be_reduced_dims = current_right ? current_right->Shape().GetDims() : right_dims;
 
         current_right = EinsumOp::ReduceSum<T>(
-            tensor_to_be_reduced, tensor_to_be_reduced_dims, {i}, allocator_, tp_, einsum_ep_assets_, device_reduce_sum_func_);
+            tensor_to_be_reduced, tensor_to_be_reduced_dims, AsSpan({i}), allocator_, tp_, einsum_ep_assets_, device_reduce_sum_func_);
       }
     } else {  // This dimension is not reduced (i.e.) it appears in the output after processing these 2 operands
       // Both the left and right operands have non-trivial dimension value along this axis
diff --git a/onnxruntime/core/providers/cpu/math/element_wise_ops.cc b/onnxruntime/core/providers/cpu/math/element_wise_ops.cc
index a7c05751d8..b62b04bda4 100644
--- a/onnxruntime/core/providers/cpu/math/element_wise_ops.cc
+++ b/onnxruntime/core/providers/cpu/math/element_wise_ops.cc
@@ -3,6 +3,7 @@
 
 #include "core/providers/cpu/math/element_wise_ops.h"
 
+#include "core/common/narrow.h"
 #include "core/framework/data_types_internal.h"
 #include "core/framework/math.h"
 #include "core/providers/cpu/tensor/utils.h"
@@ -495,7 +496,7 @@ void PowImpl(OpKernelContext& context) {
         auto Y = per_iter_bh.SpanInput1<E>();
         auto output = per_iter_bh.OutputSpan<T>();
 
-        std::transform(Y.cbegin(), Y.cend(), output.begin(),
+        std::transform(Y.begin(), Y.end(), output.begin(),
                        [X](E y) {
                          return static_cast<T>(std::pow(X, y));
                        });
@@ -507,18 +508,18 @@ void PowImpl(OpKernelContext& context) {
 
         // optimize for X^2 and X^3
         if (Y == 2) {
-          std::transform(X.cbegin(), X.cend(), output.begin(),
+          std::transform(X.begin(), X.end(), output.begin(),
                          [](T x) {
                            return static_cast<T>(x * x);
                          });
 
         } else if (Y == 3) {
-          std::transform(X.cbegin(), X.cend(), output.begin(),
+          std::transform(X.begin(), X.end(), output.begin(),
                          [](T x) {
                            return static_cast<T>(x * x * x);
                          });
         } else {
-          std::transform(X.cbegin(), X.cend(), output.begin(),
+          std::transform(X.begin(), X.end(), output.begin(),
                          [Y](T x) {
                            return static_cast<T>(std::pow(x, Y));
                          });
@@ -529,7 +530,7 @@ void PowImpl(OpKernelContext& context) {
         auto Y = per_iter_bh.SpanInput1<E>();
         auto output = per_iter_bh.OutputSpan<T>();
 
-        std::transform(X.cbegin(), X.cend(), Y.cbegin(), output.begin(),
+        std::transform(X.begin(), X.end(), Y.begin(), output.begin(),
                        [](T x, E y) {
                          return static_cast<T>(std::pow(x, y));
                        });
@@ -1330,8 +1331,8 @@ class Asinh final : public OpKernel {
     auto X_data = X.Data<float>();
     auto Y_data = Y.MutableData<float>();
 
-    auto in = gsl::make_span(X_data, gsl::narrow<ptrdiff_t>(X.Shape().Size()));
-    auto out = gsl::make_span(Y_data, gsl::narrow<ptrdiff_t>(Y.Shape().Size()));
+    auto in = gsl::make_span(X_data, narrow<ptrdiff_t>(X.Shape().Size()));
+    auto out = gsl::make_span(Y_data, narrow<ptrdiff_t>(Y.Shape().Size()));
 
     for (size_t index = 0; index < in.size(); ++index) {
       out[index] = std::asinh(in[index]);
@@ -1362,8 +1363,8 @@ class Acosh final : public OpKernel {
     auto X_data = X.Data<float>();
     auto Y_data = Y.MutableData<float>();
 
-    auto in = gsl::make_span(X_data, gsl::narrow<ptrdiff_t>(X.Shape().Size()));
-    auto out = gsl::make_span(Y_data, gsl::narrow<ptrdiff_t>(Y.Shape().Size()));
+    auto in = gsl::make_span(X_data, narrow<ptrdiff_t>(X.Shape().Size()));
+    auto out = gsl::make_span(Y_data, narrow<ptrdiff_t>(Y.Shape().Size()));
 
     for (size_t index = 0; index < in.size(); ++index) {
       out[index] = std::acosh(in[index]);
@@ -1394,8 +1395,8 @@ class Atanh final : public OpKernel {
     auto X_data = X.Data<float>();
     auto Y_data = Y.MutableData<float>();
 
-    auto in = gsl::make_span(X_data, gsl::narrow<ptrdiff_t>(X.Shape().Size()));
-    auto out = gsl::make_span(Y_data, gsl::narrow<ptrdiff_t>(Y.Shape().Size()));
+    auto in = gsl::make_span(X_data, narrow<ptrdiff_t>(X.Shape().Size()));
+    auto out = gsl::make_span(Y_data, narrow<ptrdiff_t>(Y.Shape().Size()));
 
     for (size_t index = 0; index < in.size(); ++index) {
       out[index] = std::atanh(in[index]);
@@ -1592,7 +1593,7 @@ void BroadCastFMod(OpKernelContext* context) {
         auto Y = per_iter_bh.SpanInput1<T>();
         auto output = per_iter_bh.OutputSpan<T>();
 
-        std::transform(Y.cbegin(), Y.cend(), output.begin(),
+        std::transform(Y.begin(), Y.end(), output.begin(),
                        [X](T y) {
                          return static_cast<T>(std::fmod(X, y));
                        });
@@ -1602,7 +1603,7 @@ void BroadCastFMod(OpKernelContext* context) {
         const T& Y = per_iter_bh.ScalarInput1<T>();
         auto output = per_iter_bh.OutputSpan<T>();
 
-        std::transform(X.cbegin(), X.cend(), output.begin(),
+        std::transform(X.begin(), X.end(), output.begin(),
                        [Y](T x) {
                          return static_cast<T>(std::fmod(x, Y));
                        });
@@ -1612,7 +1613,7 @@ void BroadCastFMod(OpKernelContext* context) {
         auto Y = per_iter_bh.SpanInput1<T>();
         auto output = per_iter_bh.OutputSpan<T>();
 
-        std::transform(X.cbegin(), X.cend(), Y.cbegin(), output.begin(),
+        std::transform(X.begin(), X.end(), Y.begin(), output.begin(),
                        [](T x, T y) {
                          return static_cast<T>(std::fmod(x, y));
                        });
@@ -1638,7 +1639,7 @@ void BroadCastMod(OpKernelContext* context) {
         auto Y = per_iter_bh.SpanInput1<T>();
         auto output = per_iter_bh.OutputSpan<T>();
 
-        std::transform(Y.cbegin(), Y.cend(), output.begin(),
+        std::transform(Y.begin(), Y.end(), output.begin(),
                        [X](T y) {
                          return Modulus(X, y);
                        });
@@ -1648,7 +1649,7 @@ void BroadCastMod(OpKernelContext* context) {
         const T& Y = per_iter_bh.ScalarInput1<T>();
         auto output = per_iter_bh.OutputSpan<T>();
 
-        std::transform(X.cbegin(), X.cend(), output.begin(),
+        std::transform(X.begin(), X.end(), output.begin(),
                        [Y](T x) {
                          return Modulus(x, Y);
                        });
@@ -1658,7 +1659,7 @@ void BroadCastMod(OpKernelContext* context) {
         auto Y = per_iter_bh.SpanInput1<T>();
         auto output = per_iter_bh.OutputSpan<T>();
 
-        std::transform(X.cbegin(), X.cend(), Y.cbegin(), output.begin(),
+        std::transform(X.begin(), X.end(), Y.begin(), output.begin(),
                        [](T x, T y) {
                          return Modulus(x, y);
                        });
@@ -1674,7 +1675,7 @@ void BroadCastMLFloat16FMod(OpKernelContext* context) {
         auto Y = per_iter_bh.SpanInput1<MLFloat16>();
         auto output = per_iter_bh.OutputSpan<MLFloat16>();
 
-        std::transform(Y.cbegin(), Y.cend(), output.begin(),
+        std::transform(Y.begin(), Y.end(), output.begin(),
                        [X_fl = math::halfToFloat(X.val)](const MLFloat16& y) {
                          return MLFloat16(math::floatToHalf(std::fmod(X_fl, math::halfToFloat(y.val))));
                        });
@@ -1684,7 +1685,7 @@ void BroadCastMLFloat16FMod(OpKernelContext* context) {
         const MLFloat16 Y = per_iter_bh.ScalarInput1<MLFloat16>();
         auto output = per_iter_bh.OutputSpan<MLFloat16>();
 
-        std::transform(X.cbegin(), X.cend(), output.begin(),
+        std::transform(X.begin(), X.end(), output.begin(),
                        [Y_fl = math::halfToFloat(Y.val)](const MLFloat16& x) {
                          return MLFloat16(math::floatToHalf(std::fmod(math::halfToFloat(x.val), Y_fl)));
                        });
@@ -1694,7 +1695,7 @@ void BroadCastMLFloat16FMod(OpKernelContext* context) {
         auto Y = per_iter_bh.SpanInput1<MLFloat16>();
         auto output = per_iter_bh.OutputSpan<MLFloat16>();
 
-        std::transform(X.cbegin(), X.cend(), Y.cbegin(), output.begin(),
+        std::transform(X.begin(), X.end(), Y.begin(), output.begin(),
                        [](const MLFloat16& x, const MLFloat16& y) {
                          auto x_fl = math::halfToFloat(x.val);
                          auto y_fl = math::halfToFloat(y.val);
diff --git a/onnxruntime/core/providers/cpu/math/element_wise_ops.h b/onnxruntime/core/providers/cpu/math/element_wise_ops.h
index 7c7953015a..b7deea9625 100644
--- a/onnxruntime/core/providers/cpu/math/element_wise_ops.h
+++ b/onnxruntime/core/providers/cpu/math/element_wise_ops.h
@@ -5,6 +5,7 @@
 
 #include "core/common/common.h"
 #include "core/common/inlined_containers.h"
+#include "core/common/narrow.h"
 #include "core/framework/op_kernel.h"
 #include "core/util/math_cpuonly.h"
 #include "core/providers/cpu/element_wise_ranged_transform.h"
@@ -456,12 +457,12 @@ class Erf final : public OpKernel {
 
 template <typename T>
 auto MakeEigenArrayMap(Tensor& t) -> EigenVectorArrayMap<T> {
-  return EigenVectorArrayMap<T>(t.MutableData<T>(), gsl::narrow<ptrdiff_t>(t.Shape().Size()));
+  return EigenVectorArrayMap<T>(t.MutableData<T>(), narrow<ptrdiff_t>(t.Shape().Size()));
 }
 
 template <typename T>
 auto MakeEigenArrayMap(const Tensor& t) -> ConstEigenVectorArrayMap<T> {
-  return ConstEigenVectorArrayMap<T>(t.Data<T>(), gsl::narrow<ptrdiff_t>(t.Shape().Size()));
+  return ConstEigenVectorArrayMap<T>(t.Data<T>(), narrow<ptrdiff_t>(t.Shape().Size()));
 }
 
 struct BroadcastIterator {
@@ -751,7 +752,7 @@ struct OutputBroadcaster {
   OutputBroadcaster(size_t span_size, Tensor& tensor, ptrdiff_t start_offset = 0, ptrdiff_t end_offset = 0)
       : element_size_(tensor.DataType()->Size()),
         span_size_(span_size) {
-    ptrdiff_t len = gsl::narrow<ptrdiff_t>(tensor.Shape().Size());
+    ptrdiff_t len = narrow<ptrdiff_t>(tensor.Shape().Size());
     ptrdiff_t real_end = (end_offset <= 0) ? len : end_offset;
     if (start_offset != 0 || end_offset != 0) {  // Keep original semantic
       ORT_ENFORCE(start_offset >= 0 && real_end >= 0 && start_offset <= real_end && real_end <= len,
diff --git a/onnxruntime/core/providers/cpu/math/gemm.cc b/onnxruntime/core/providers/cpu/math/gemm.cc
index 9c62d61ace..ac0e34b419 100644
--- a/onnxruntime/core/providers/cpu/math/gemm.cc
+++ b/onnxruntime/core/providers/cpu/math/gemm.cc
@@ -1,8 +1,10 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <core/common/safeint.h>
+
 #include "core/providers/cpu/math/gemm.h"
+#include "core/common/narrow.h"
+#include "core/common/safeint.h"
 #include "core/providers/cpu/math/gemm_matmul_common.h"
 #include "core/util/math_cpuonly.h"
 #include "gemm_helper.h"
@@ -126,7 +128,7 @@ void Gemm<T>::ComputeGemm(CBLAS_TRANSPOSE trans_a, CBLAS_TRANSPOSE trans_b,
   GemmBroadcastBias(M, N, beta, c_data, c_shape, y_data);
 
   math::Gemm<T>(trans_a, trans_b,
-                gsl::narrow<ptrdiff_t>(M), gsl::narrow<ptrdiff_t>(N), gsl::narrow<ptrdiff_t>(K),
+                narrow<ptrdiff_t>(M), narrow<ptrdiff_t>(N), narrow<ptrdiff_t>(K),
                 alpha,
                 a_data,
                 b_data,
diff --git a/onnxruntime/core/providers/cpu/math/hardmax.h b/onnxruntime/core/providers/cpu/math/hardmax.h
index 277001780b..02b9b96fd3 100644
--- a/onnxruntime/core/providers/cpu/math/hardmax.h
+++ b/onnxruntime/core/providers/cpu/math/hardmax.h
@@ -3,7 +3,7 @@
 
 #pragma once
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 #include "core/common/common.h"
 #include "core/framework/op_kernel.h"
diff --git a/onnxruntime/core/providers/cpu/math/sign.cc b/onnxruntime/core/providers/cpu/math/sign.cc
index 1684a09252..4e7c582cfc 100644
--- a/onnxruntime/core/providers/cpu/math/sign.cc
+++ b/onnxruntime/core/providers/cpu/math/sign.cc
@@ -3,7 +3,7 @@
 
 #include <type_traits>
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 #include "core/common/common.h"
 #include "core/framework/data_types.h"
@@ -75,7 +75,7 @@ struct CallSignImpl<MLFloat16> {
   void operator()(const Tensor* input, Tensor* output) const {
     auto span = gsl::make_span(input->Data<MLFloat16>(), input->Shape().Size());
     auto output_data = output->MutableData<MLFloat16>();
-    std::transform(span.cbegin(), span.cend(), output_data, [](const MLFloat16& val) {
+    std::transform(span.begin(), span.end(), output_data, [](const MLFloat16& val) {
       float fl = math::halfToFloat(val.val);
       return MLFloat16(math::floatToHalf(FloatingImpl(fl)));
     });
@@ -87,7 +87,7 @@ struct CallSignImpl<BFloat16> {
   void operator()(const Tensor* input, Tensor* output) const {
     auto span = gsl::make_span(input->Data<BFloat16>(), input->Shape().Size());
     auto output_data = output->MutableData<BFloat16>();
-    std::transform(span.cbegin(), span.cend(), output_data, [](const BFloat16& val) {
+    std::transform(span.begin(), span.end(), output_data, [](const BFloat16& val) {
       float fl = val.ToFloat();
       return BFloat16(FloatingImpl(fl));
     });
diff --git a/onnxruntime/core/providers/cpu/math/softmax.h b/onnxruntime/core/providers/cpu/math/softmax.h
index dbcce64c7a..448a97bfbe 100644
--- a/onnxruntime/core/providers/cpu/math/softmax.h
+++ b/onnxruntime/core/providers/cpu/math/softmax.h
@@ -3,7 +3,7 @@
 
 #pragma once
 
-#include "gsl/gsl-lite.hpp"
+#include "core/common/gsl.h"
 
 #include "core/common/common.h"
 #include "core/framework/op_kernel.h"
diff --git a/onnxruntime/core/providers/cpu/math/softmax_shared.cc b/onnxruntime/core/providers/cpu/math/softmax_shared.cc
index b8eb8dae09..6329378b68 100644
--- a/onnxruntime/core/providers/cpu/math/softmax_shared.cc
+++ b/onnxruntime/core/providers/cpu/math/softmax_shared.cc
@@ -22,7 +22,7 @@
 
 #include <algorithm>
 #include <cmath>
-#include <gsl/gsl>
+#include "core/common/gsl.h"
 
 #include "core/util/math.h"
 #include "core/util/math_cpuonly.h"
diff --git a/onnxruntime/core/providers/cpu/ml/cast_map.cc b/onnxruntime/core/providers/cpu/ml/cast_map.cc
index aed38deaae..ab54b21376 100644
--- a/onnxruntime/core/providers/cpu/ml/cast_map.cc
+++ b/onnxruntime/core/providers/cpu/ml/cast_map.cc
@@ -3,7 +3,7 @@
 
 #include "core/providers/cpu/ml/cast_map.h"
 #include <algorithm>
-#include <gsl/gsl>
+#include "core/common/gsl.h"
 using namespace ::onnxruntime::common;
 
 namespace {
diff --git a/onnxruntime/core/providers/cpu/ml/category_mapper.cc b/onnxruntime/core/providers/cpu/ml/category_mapper.cc
index 68ef3a2aae..d4d2c249d2 100644
--- a/onnxruntime/core/providers/cpu/ml/category_mapper.cc
+++ b/onnxruntime/core/providers/cpu/ml/category_mapper.cc
@@ -3,7 +3,7 @@
 
 #include "core/providers/cpu/ml/category_mapper.h"
 #include <algorithm>
-#include <gsl/gsl>
+#include "core/common/gsl.h"
 using namespace ::onnxruntime::common;
 
 namespace onnxruntime {
@@ -38,7 +38,7 @@ Status CategoryMapper::Compute(OpKernelContext* context) const {
     // map isn't going to change so get end() once instead of calling inside the for_each loop
     const auto map_end = string_to_int_map_.end();
 
-    std::for_each(input.cbegin(), input.cend(),
+    std::for_each(input.begin(), input.end(),
                   [&out, &map_end, this](const std::string& value) {
                     auto map_to = string_to_int_map_.find(value);
                     *out = map_to == map_end ? default_int_ : map_to->second;
@@ -54,7 +54,7 @@ Status CategoryMapper::Compute(OpKernelContext* context) const {
 
     const auto map_end = int_to_string_map_.end();
 
-    std::for_each(input.cbegin(), input.cend(),
+    std::for_each(input.begin(), input.end(),
                   [&out, &map_end, this](const int64_t& value) {
                     auto map_to = int_to_string_map_.find(value);
                     *out = map_to == map_end ? default_string_ : map_to->second;
diff --git a/onnxruntime/core/providers/cpu/ml/feature_vectorizer.cc b/onnxruntime/core/providers/cpu/ml/feature_vectorizer.cc
index e48000c337..2685434f96 100644
--- a/onnxruntime/core/providers/cpu/ml/feature_vectorizer.cc
+++ b/onnxruntime/core/providers/cpu/ml/feature_vectorizer.cc
@@ -3,7 +3,7 @@
 
 #include "core/providers/cpu/ml/feature_vectorizer.h"
 
-#include <gsl/gsl>
+#include "core/common/gsl.h"
 
 namespace onnxruntime {
 namespace ml {
@@ -23,8 +23,8 @@ static void VectorizeTensor(const Tensor& input_tensor, int64_t feature_size, in
                             typename gsl::span<float>::iterator out_iter);
 
 template <typename T>
-static void CopyWithCast(typename gsl::span<const T>::const_iterator begin,
-                         typename gsl::span<const T>::const_iterator end,
+static void CopyWithCast(typename gsl::span<const T>::iterator begin,
+                         typename gsl::span<const T>::iterator end,
                          gsl::span<float>::iterator out_iter);
 
 Status FeatureVectorizer::Compute(OpKernelContext* context) const {
@@ -99,7 +99,7 @@ static void VectorizeTensor(const Tensor& input_tensor, int64_t feature_size, in
 
   auto data = input_tensor.Data<T>();
   auto input = gsl::make_span(data, shape.Size());
-  auto input_iter = input.cbegin();
+  auto input_iter = input.begin();
 
   for (int i = 0; i < N;) {
     // copy each row to the output. iters are passed by value
@@ -115,11 +115,11 @@ static void VectorizeTensor(const Tensor& input_tensor, int64_t feature_size, in
 }
 
 template <typename T>
-static void CopyWithCast(typename gsl::span<const T>::const_iterator begin,
-                         typename gsl::span<const T>::const_iterator end,
+static void CopyWithCast(typename gsl::span<const T>::iterator begin,
+                         typename gsl::span<const T>::iterator end,
                          gsl::span<float>::iterator out_iter) {
   std::for_each(begin, end,
-                [&out_iter](const typename gsl::span<T>::const_reference value) {
+                [&out_iter](const typename gsl::span<const T>::const_reference value) {
                   *out_iter = static_cast<float>(value);
                   ++out_iter;
                 });
diff --git a/onnxruntime/core/providers/cpu/ml/label_encoder.cc b/onnxruntime/core/providers/cpu/ml/label_encoder.cc
index fbe3ba160e..0bcce7e74a 100644
--- a/onnxruntime/core/providers/cpu/ml/label_encoder.cc
+++ b/onnxruntime/core/providers/cpu/ml/label_encoder.cc
@@ -3,7 +3,7 @@
 
 #include "core/providers/cpu/ml/label_encoder.h"
 #include <algorithm>
-#include <gsl/gsl>
+#include "core/common/gsl.h"
 using namespace ::onnxruntime::common;
 
 namespace onnxruntime {
@@ -39,7 +39,7 @@ Status LabelEncoder::Compute(OpKernelContext* context) const {
     // map isn't going to change so get end() once instead of calling inside the for_each loop
     const auto map_end = string_to_int_map_.end();
 
-    std::for_each(input.cbegin(), input.cend(),
+    std::for_each(input.begin(), input.end(),
                   [&out, &map_end, this](const std::string& value) {
                     auto map_to = string_to_int_map_.find(value);
                     *out = map_to == map_end ? default_int_ : map_to->second;
@@ -55,7 +55,7 @@ Status LabelEncoder::Compute(OpKernelContext* context) const {
 
     const auto map_end = int_to_string_map_.end();
 
-    std::for_each(input.cbegin(), input.cend(),
+    std::for_each(input.begin(), input.end(),
                   [&out, &map_end, this](const int64_t& value) {
                     auto map_to = int_to_string_map_.find(value);
                     *out = map_to == map_end ? default_string_ : map_to->second;
diff --git a/onnxruntime/core/providers/cpu/ml/linearclassifier.cc b/onnxruntime/core/providers/cpu/ml/linearclassifier.cc
index e55ebc36b5..c8bac6be3a 100644
--- a/onnxruntime/core/providers/cpu/ml/linearclassifier.cc
+++ b/onnxruntime/core/providers/cpu/ml/linearclassifier.cc
@@ -55,9 +55,9 @@ void LinearClassifier::ComputeImpl(const gsl::span<const float> input,
   const float* input_data = input.data();
   auto scores_output_data = scores_output.MutableDataAsSpan<float>();
   size_t scores_output_size = num_batches * num_targets * (add_second_class ? 2 : 1);
-  ORT_ENFORCE(scores_output_data.length() >= scores_output_size,
+  ORT_ENFORCE(scores_output_data.size() >= scores_output_size,
               "Scores output is incorrect size. Expected:", scores_output_size,
-              " Found:", scores_output_data.length());
+              " Found:", scores_output_data.size());
 
   TensorShape intercepts_shape({num_targets});
   onnxruntime::Gemm<float>::ComputeGemm(CBLAS_TRANSPOSE::CblasNoTrans, CBLAS_TRANSPOSE::CblasTrans,
@@ -122,7 +122,7 @@ void LinearClassifier::ComputeImpl(const gsl::span<const float> input,
 template <typename SrcType>
 static void CastInputToFloat(const Tensor& in, gsl::span<float>& out) {
   size_t shape_size = static_cast<size_t>(in.Shape().Size());
-  ORT_ENFORCE(shape_size == out.length());
+  ORT_ENFORCE(shape_size == out.size());
 
   const SrcType* in_data = in.Data<SrcType>();
   float* out_data = out.data();
diff --git a/onnxruntime/core/providers/cpu/ml/ml_common.h b/onnxruntime/core/providers/cpu/ml/ml_common.h
index 3ec98d4347..2a0f261c22 100644
--- a/onnxruntime/core/providers/cpu/ml/ml_common.h
+++ b/onnxruntime/core/providers/cpu/ml/ml_common.h
@@ -264,7 +264,7 @@ static inline void ComputeSoftmax(gsl::span<T>& values) {
 
   // compute exp with negative number to be numerically stable
   float v_max = -std::numeric_limits<float>::max();
-  for (auto it = values.cbegin(); it != values.cend(); ++it) {
+  for (auto it = values.begin(); it != values.end(); ++it) {
     if (static_cast<float>(*it) > v_max)
       v_max = static_cast<float>(*it);
   }
@@ -282,7 +282,7 @@ template <typename T>
 static inline void ComputeSoftmaxZero(gsl::span<T>& values) {
   // compute exp with negative number to be numerically stable
   float v_max = -std::numeric_limits<float>::max();
-  for (auto it = values.cbegin(); it != values.cend(); ++it) {
+  for (auto it = values.begin(); it != values.end(); ++it) {
     if (static_cast<float>(*it) > v_max)
       v_max = static_cast<float>(*it);
   }
@@ -512,7 +512,7 @@ void batched_update_scores_inplace(gsl::span<T> scores, int64_t num_batches_in,
       } else {
         // reverse iteration as the scores are packed together and each score needs to be expanded to two
         const float* cur_in = s_end;
-        float* cur_out = &*scores.end();
+        float* cur_out = scores.data() + scores.size();
         while (cur_in > s) {
           --cur_in;
           cur_out -= 2;
diff --git a/onnxruntime/core/providers/cpu/ml/normalizer.cc b/onnxruntime/core/providers/cpu/ml/normalizer.cc
index 42e89ca382..68ca49dee1 100644
--- a/onnxruntime/core/providers/cpu/ml/normalizer.cc
+++ b/onnxruntime/core/providers/cpu/ml/normalizer.cc
@@ -4,7 +4,6 @@
 #include "core/providers/cpu/ml/normalizer.h"
 
 #include <algorithm>
-#include "gsl/gsl"
 
 /*
 ONNX_OPERATOR_SCHEMA(Normalizer)
diff --git a/onnxruntime/core/providers/cpu/ml/normalizer.h b/onnxruntime/core/providers/cpu/ml/normalizer.h
index 5172d1e6c6..f7811de0a1 100644
--- a/onnxruntime/core/providers/cpu/ml/normalizer.h
+++ b/onnxruntime/core/providers/cpu/ml/normalizer.h
@@ -7,8 +7,6 @@
 #include "core/framework/op_kernel.h"
 #include "core/providers/cpu/ml/ml_common.h"
 
-#include "gsl/gsl"
-
 namespace onnxruntime {
 namespace ml {
 
diff --git a/onnxruntime/core/providers/cpu/ml/svmclassifier.cc b/onnxruntime/core/providers/cpu/ml/svmclassifier.cc
index b91fb2ac35..fece9a1a49 100644
--- a/onnxruntime/core/providers/cpu/ml/svmclassifier.cc
+++ b/onnxruntime/core/providers/cpu/ml/svmclassifier.cc
@@ -342,11 +342,11 @@ Status SVMClassifier::ComputeImpl(OpKernelContext& ctx,
     int64_t maxclass = -1;
     if (votes_data.size() > 0) {
       auto votes = gsl::make_span<int64_t>(votes_data.data() + (n * class_count_), class_count_);
-      auto it_maxvotes = std::max_element(votes.cbegin(), votes.cend());
-      maxclass = std::distance(votes.cbegin(), it_maxvotes);
+      auto it_maxvotes = std::max_element(votes.begin(), votes.end());
+      maxclass = std::distance(votes.begin(), it_maxvotes);
     } else {
-      auto it_max_weight = std::max_element(cur_scores.cbegin(), cur_scores.cend());
-      maxclass = std::distance(cur_scores.cbegin(), it_max_weight);
+      auto it_max_weight = std::max_element(cur_scores.begin(), cur_scores.end());
+      maxclass = std::distance(cur_scores.begin(), it_max_weight);
       max_weight = *it_max_weight;
     }
 
diff --git a/onnxruntime/core/providers/cpu/nn/Unpool.cc b/onnxruntime/core/providers/cpu/nn/Unpool.cc
index 5ff4b5f686..fc5744e24c 100644
--- a/onnxruntime/core/providers/cpu/nn/Unpool.cc
+++ b/onnxruntime/core/providers/cpu/nn/Unpool.cc
@@ -7,6 +7,7 @@
 #pragma warning(disable : 4996)
 #endif
 #include "core/providers/cpu/nn/unpool.h"
+#include "core/common/narrow.h"
 #include "core/providers/cpu/tensor/utils.h"
 #include <cmath>
 
@@ -91,11 +92,11 @@ Status MaxUnpool::Compute(OpKernelContext* context) const {
 
   Tensor* Y = context->Output(0, shape);
   auto* Y_data = Y->MutableData<float>();
-  auto out = gsl::make_span(Y_data, gsl::narrow<size_t>(Y->Shape().Size()));
+  auto out = gsl::make_span(Y_data, narrow<size_t>(Y->Shape().Size()));
   std::fill_n(out.data(), out.size(), 0.f);
 
   for (auto cur_elem = 0; cur_elem < total_elements; ++cur_elem) {
-    out[gsl::narrow<size_t>(I_data[gsl::narrow<size_t>(cur_elem)])] = X_data[gsl::narrow<size_t>(cur_elem)];
+    out[narrow<size_t>(I_data[narrow<size_t>(cur_elem)])] = X_data[narrow<size_t>(cur_elem)];
   }
 
   return Status::OK();
diff --git a/onnxruntime/core/providers/cpu/nn/batch_norm.h b/onnxruntime/core/providers/cpu/nn/batch_norm.h
index f1e6cad3bf..67c2691944 100644
--- a/onnxruntime/core/providers/cpu/nn/batch_norm.h
+++ b/onnxruntime/core/providers/cpu/nn/batch_norm.h
@@ -19,6 +19,7 @@
 
 #include "core/common/common.h"
 #include "core/common/exceptions.h"
+#include "core/common/narrow.h"
 #include "core/framework/op_kernel.h"
 #include "core/providers/common.h"
 #include "core/framework/tensor.h"
@@ -81,7 +82,7 @@ class BatchNorm : public OpKernel {
     // calculate sample_size (per individual channel)
     size_t sample_size = 1;
     for (size_t i = 2; i < dims_vec.size(); ++i) {
-      sample_size *= gsl::narrow<size_t>(dims_vec[i]);
+      sample_size *= narrow<size_t>(dims_vec[i]);
     }
 
     // calculate sample_size (including all channels)
diff --git a/onnxruntime/core/providers/cpu/nn/conv.cc b/onnxruntime/core/providers/cpu/nn/conv.cc
index 527cec46dd..1a40b05cc3 100644
--- a/onnxruntime/core/providers/cpu/nn/conv.cc
+++ b/onnxruntime/core/providers/cpu/nn/conv.cc
@@ -17,6 +17,7 @@
 
 #include "core/providers/cpu/nn/conv.h"
 
+#include "core/common/narrow.h"
 #include "core/common/safeint.h"
 #include "core/util/math_cpuonly.h"
 
@@ -274,9 +275,9 @@ Status Conv<float>::Compute(OpKernelContext* context) const {
         math::Gemm<float>(
             CblasNoTrans,
             CblasNoTrans,
-            gsl::narrow<ptrdiff_t>(M / conv_attrs_.group),
-            gsl::narrow<ptrdiff_t>(output_image_size),
-            gsl::narrow<ptrdiff_t>(kernel_dim),
+            narrow<ptrdiff_t>(M / conv_attrs_.group),
+            narrow<ptrdiff_t>(output_image_size),
+            narrow<ptrdiff_t>(kernel_dim),
             1,
             W->Data<float>() + group_id * W_offset,
             col_buffer_data,
@@ -285,7 +286,7 @@ Status Conv<float>::Compute(OpKernelContext* context) const {
             thread_pool);
       }
 
-      MlasActivation(&activation_, Ydata, Bdata, gsl::narrow<size_t>(M), gsl::narrow<size_t>(output_image_size), gsl::narrow<size_t>(output_image_size));
+      MlasActivation(&activation_, Ydata, Bdata, narrow<size_t>(M), narrow<size_t>(output_image_size), narrow<size_t>(output_image_size));
 
       Xdata += X_offset * conv_attrs_.group;
       Ydata += Y_offset * conv_attrs_.group;
diff --git a/onnxruntime/core/providers/cpu/nn/conv_attributes.h b/onnxruntime/core/providers/cpu/nn/conv_attributes.h
index e4064b4541..51a1e7acaf 100644
--- a/onnxruntime/core/providers/cpu/nn/conv_attributes.h
+++ b/onnxruntime/core/providers/cpu/nn/conv_attributes.h
@@ -44,7 +44,7 @@ struct ConvAttributes {
       // Pads are explicitly provided, make sure that auto_pad is NOTSET
       ORT_ENFORCE(auto_pad == AutoPadType::NOTSET,
                   "A Conv/ConvTranspose node has both 'auto_pad' and 'pads' attributes");
-      pads.assign(pads_span.cbegin(), pads_span.cend());
+      pads.assign(pads_span.begin(), pads_span.end());
     }
 
     status = info.GetAttrs("dilations", dilations);
diff --git a/onnxruntime/core/providers/cpu/nn/flatten.h b/onnxruntime/core/providers/cpu/nn/flatten.h
index 792c196928..22d339e8f3 100644
--- a/onnxruntime/core/providers/cpu/nn/flatten.h
+++ b/onnxruntime/core/providers/cpu/nn/flatten.h
@@ -5,7 +5,7 @@
 
 #include "core/common/common.h"
 #include "core/framework/op_kernel.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "core/providers/cpu/tensor/utils.h"
 #include "core/providers/common.h"
 
diff --git a/onnxruntime/core/providers/cpu/nn/lrn.h b/onnxruntime/core/providers/cpu/nn/lrn.h
index 136cd2b00e..e797ffda87 100644
--- a/onnxruntime/core/providers/cpu/nn/lrn.h
+++ b/onnxruntime/core/providers/cpu/nn/lrn.h
@@ -3,7 +3,7 @@
 
 #pragma once
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 #include "core/common/common.h"
 #include "core/common/exceptions.h"
diff --git a/onnxruntime/core/providers/cpu/nn/shrink.cc b/onnxruntime/core/providers/cpu/nn/shrink.cc
index 62c7e6def1..a0771d175a 100644
--- a/onnxruntime/core/providers/cpu/nn/shrink.cc
+++ b/onnxruntime/core/providers/cpu/nn/shrink.cc
@@ -57,7 +57,7 @@ template <>
 Status ShrinkImpl<MLFloat16>(const Tensor* input, Tensor* output, float bias, float lambd) {
   const auto& span = gsl::make_span(input->Data<MLFloat16>(), input->Shape().Size());
   auto* output_data = output->MutableData<MLFloat16>();
-  std::transform(span.cbegin(), span.cend(), output_data, [bias, lambd](const MLFloat16& val) {
+  std::transform(span.begin(), span.end(), output_data, [bias, lambd](const MLFloat16& val) {
     float fl = math::halfToFloat(val.val);
     return MLFloat16(math::floatToHalf(ShrinkCore<float>(fl, bias, lambd)));
   });
@@ -68,7 +68,7 @@ template <>
 Status ShrinkImpl<BFloat16>(const Tensor* input, Tensor* output, float bias, float lambd) {
   const auto& span = gsl::make_span(input->Data<BFloat16>(), input->Shape().Size());
   auto* output_data = output->MutableData<BFloat16>();
-  std::transform(span.cbegin(), span.cend(), output_data, [bias, lambd](const BFloat16& val) {
+  std::transform(span.begin(), span.end(), output_data, [bias, lambd](const BFloat16& val) {
     float fl = val.ToFloat();
     return BFloat16(ShrinkCore<float>(fl, bias, lambd));
   });
diff --git a/onnxruntime/core/providers/cpu/nn/tfidfvectorizer.cc b/onnxruntime/core/providers/cpu/nn/tfidfvectorizer.cc
index 464d027202..8bad3b872e 100644
--- a/onnxruntime/core/providers/cpu/nn/tfidfvectorizer.cc
+++ b/onnxruntime/core/providers/cpu/nn/tfidfvectorizer.cc
@@ -191,11 +191,11 @@ TfIdfVectorizer::TfIdfVectorizer(const OpKernelInfo& info) : OpKernel(info), imp
   ORT_ENFORCE(status.IsOK() && !impl_->ngram_indexes_.empty(), "Non-empty ngram_indexes is required");
   {
     // Check that all are positive
-    ORT_ENFORCE(std::all_of(impl_->ngram_indexes_.cbegin(), impl_->ngram_indexes_.cend(),
+    ORT_ENFORCE(std::all_of(impl_->ngram_indexes_.begin(), impl_->ngram_indexes_.end(),
                             [](int64_t i) { return i >= 0; }),
                 "Negative ngram_indexes values are not allowed");
     // Set output size to max output index + 1;
-    auto greatest_hit = std::max_element(impl_->ngram_indexes_.cbegin(), impl_->ngram_indexes_.cend());
+    auto greatest_hit = std::max_element(impl_->ngram_indexes_.begin(), impl_->ngram_indexes_.end());
     impl_->output_size_ = *greatest_hit + 1;
   }
 
diff --git a/onnxruntime/core/providers/cpu/quantization/quantize_linear_matmul.cc b/onnxruntime/core/providers/cpu/quantization/quantize_linear_matmul.cc
index b1a8d0c16b..cb162ade44 100644
--- a/onnxruntime/core/providers/cpu/quantization/quantize_linear_matmul.cc
+++ b/onnxruntime/core/providers/cpu/quantization/quantize_linear_matmul.cc
@@ -3,9 +3,10 @@
 
 #include "quantize_linear_matmul.h"
 
+#include "core/common/narrow.h"
+#include "core/common/safeint.h"
 #include "core/framework/op_kernel.h"
 #include "core/providers/cpu/math/matmul_helper.h"
-#include "core/common/safeint.h"
 #include "core/providers/common.h"
 #include "core/util/math_cpuonly.h"
 #include "core/util/qmath.h"
@@ -90,9 +91,9 @@ Status QLinearMatMul::Compute(OpKernelContext* ctx) const {
   auto y_scale_data = *(y_scale->Data<float>());
 
   const int64_t output_scale_size = b_scale->Shape().Size();
-  std::vector<float> output_scales(gsl::narrow<size_t>(output_scale_size));
+  std::vector<float> output_scales(narrow<size_t>(output_scale_size));
   for (int64_t i = 0; i < output_scale_size; i++) {
-    output_scales[gsl::narrow<size_t>(i)] = (a_scale_data * b_scale_data[gsl::narrow<size_t>(i)] / y_scale_data);
+    output_scales[narrow<size_t>(i)] = (a_scale_data * b_scale_data[narrow<size_t>(i)] / y_scale_data);
   }
 
   const size_t num_gemms = helper.OutputOffsets().size();
diff --git a/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc b/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc
index c18d6a5ad2..ff153f1977 100644
--- a/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc
+++ b/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc
@@ -1,8 +1,11 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/common/inlined_containers.h"
 #include "core/providers/cpu/reduction/reduction_ops.h"
+
+#include "core/common/inlined_containers.h"
+#include "core/common/narrow.h"
+#include "core/common/span_utils.h"
 #include "core/providers/common.h"
 //TODO: fix the warnings
 #if defined(_MSC_VER) && !defined(__clang__)
@@ -231,9 +234,9 @@ bool operator!=(FastReduceKind a, FastReduceKind b) {
 
 bool ResultsNoTransposePrepareForReduce::equal(gsl::span<const int64_t> local_input_shape,
                                                gsl::span<const int64_t> local_reduced_axes) {
-  if (gsl::make_span(input_shape) != local_input_shape)
+  if (!SpanEq(gsl::make_span(input_shape), local_input_shape))  // input_shape differs from the caller's shape
     return false;
-  if (gsl::make_span(reduced_axes) != local_reduced_axes)
+  if (!SpanEq(gsl::make_span(reduced_axes), local_reduced_axes))  // reduced_axes differs from the caller's axes
     return false;
   return true;
 }
@@ -556,7 +559,7 @@ FastReduceKind OptimizeShapeForFastReduce(gsl::span<const int64_t> input_shape,
   }
 
   InlinedHashSet<int64_t> axes;
-  const auto input_shape_size = gsl::narrow<int64_t>(input_shape.size());
+  const auto input_shape_size = narrow<int64_t>(input_shape.size());
   if (reduced_axes.size() == 0 && !noop_with_empty_axes) {
     for (int64_t i = 0; i < input_shape_size; ++i) {
       axes.insert(i);
@@ -594,7 +597,7 @@ FastReduceKind OptimizeShapeForFastReduce(gsl::span<const int64_t> input_shape,
     }
     if (noop_with_empty_axes) {
       fast_axes.clear();
-      fast_output_shape.assign(input_shape.cbegin(), input_shape.cend());
+      fast_output_shape.assign(input_shape.begin(), input_shape.end());
       return FastReduceKind::kK;
     } else {
       if (keep_dims) {
diff --git a/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.cc b/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.cc
index e53dbb4cf0..f9b598b5a1 100644
--- a/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.cc
+++ b/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.cc
@@ -281,9 +281,9 @@ Status DeepCpuGruOp::ComputeImpl(OpKernelContext& context) const {
 
   auto& X_shape = X.Shape();
 
-  int seq_length = gsl::narrow<int>(X_shape[0]);
-  int batch_size = gsl::narrow<int>(X_shape[1]);
-  int input_size = gsl::narrow<int>(X_shape[2]);
+  int seq_length = narrow<int>(X_shape[0]);
+  int batch_size = narrow<int>(X_shape[1]);
+  int input_size = narrow<int>(X_shape[2]);
 
   auto status = ValidateCommonRnnInputs(X, W.Shape(), R.Shape(), B, 3, sequence_lens, initial_h, num_directions_, hidden_size_);
   ORT_RETURN_IF_ERROR(status);
@@ -462,7 +462,7 @@ UniDirectionalGru<T>::UniDirectionalGru(AllocatorPtr allocator,
       }
 
       // replicate what we just wrote to the start of the output span so we have batch_size_ copies
-      auto values = output.cbegin();
+      auto values = output.begin();
       ORT_IGNORE_RETURN_VALUE(RepeatVectorToConstructArray(values, values + hidden_size_,
                                                            output.begin() + hidden_size_,  // skip the first batch
                                                            batch_size_ - 1));              // and replicate batch size - 1 times
@@ -475,8 +475,8 @@ UniDirectionalGru<T>::UniDirectionalGru(AllocatorPtr allocator,
     // how we treat the h weight depends on whether linear_before_reset_ is set
     if (linear_before_reset_) {
       // need to replicate Wb[o] and Rb[o] separately
-      ORT_IGNORE_RETURN_VALUE(RepeatVectorToConstructArray(bias_Wo.cbegin(), bias_Wo.cend(), batched_bias_Wh_.begin(), batch_size_));
-      ORT_IGNORE_RETURN_VALUE(RepeatVectorToConstructArray(bias_Ro.cbegin(), bias_Ro.cend(), batched_bias_Rh_.begin(), batch_size_));
+      ORT_IGNORE_RETURN_VALUE(RepeatVectorToConstructArray(bias_Wo.begin(), bias_Wo.end(), batched_bias_Wh_.begin(), batch_size_));
+      ORT_IGNORE_RETURN_VALUE(RepeatVectorToConstructArray(bias_Ro.begin(), bias_Ro.end(), batched_bias_Rh_.begin(), batch_size_));
     } else {
       combine_and_replicate(bias_Wo, bias_Ro, batched_bias_WRh_);
     }
@@ -495,7 +495,7 @@ void UniDirectionalGru<T>::Compute(const gsl::span<const T>& inputs_arg,
                                    const gsl::span<const T>& recurrent_weights,
                                    gsl::span<T>& outputs,
                                    gsl::span<T>& final_hidden_state) {
-  using span_T_const_iter = typename gsl::span<T>::const_iterator;
+  using span_T_const_iter = typename gsl::span<const T>::iterator;
   using span_T_iter = typename gsl::span<T>::iterator;
 
   // copy inputs_arg as we may change it to point to inputs_reverse_
@@ -530,9 +530,9 @@ void UniDirectionalGru<T>::Compute(const gsl::span<const T>& inputs_arg,
   }
 
   // Calculate the max and min length
-  int32_t max_sequence_length = *std::max_element(sequence_lengths.cbegin(), sequence_lengths.cend());
-  int32_t min_sequence_length = std::min(seq_length_, *std::min_element(sequence_lengths.cbegin(),
-                                                                        sequence_lengths.cend()));
+  int32_t max_sequence_length = *std::max_element(sequence_lengths.begin(), sequence_lengths.end());
+  int32_t min_sequence_length = std::min(seq_length_, *std::min_element(sequence_lengths.begin(),
+                                                                        sequence_lengths.end()));
 
   const int hidden_size_x2 = 2 * hidden_size_;
   const int hidden_size_x3 = 3 * hidden_size_;
@@ -542,9 +542,9 @@ void UniDirectionalGru<T>::Compute(const gsl::span<const T>& inputs_arg,
 
   // apply weights to all the inputs
   ComputeGemm(total_rows, hidden_size_x3, input_size_, alpha,
-              inputs.cbegin(), inputs.cend(),
+              inputs.begin(), inputs.end(),
               input_size_,
-              input_weights.cbegin(), input_weights.cend(),
+              input_weights.begin(), input_weights.end(),
               input_size_, 0.f,
               outputZRH_.begin(), outputZRH_.end(),
               hidden_size_x3, ttp_);
@@ -562,16 +562,16 @@ void UniDirectionalGru<T>::Compute(const gsl::span<const T>& inputs_arg,
     output_step_length = 2 * batch_size_ * hidden_size_;
 
   // convenience end iterators we use in the loops below to detect any bounds issues
-  span_T_const_iter batched_bias_WRz_local_end = batched_bias_WRz_.cend();
-  span_T_const_iter batched_bias_WRr_local_end = batched_bias_WRr_.cend();
-  span_T_const_iter batched_bias_Wh_local_end = batched_bias_Wh_.cend();
-  span_T_const_iter batched_bias_Rh_local_end = batched_bias_Rh_.cend();
-  span_T_const_iter batched_bias_WRh_local_end = batched_bias_WRh_.cend();
+  span_T_const_iter batched_bias_WRz_local_end = batched_bias_WRz_.end();
+  span_T_const_iter batched_bias_WRr_local_end = batched_bias_WRr_.end();
+  span_T_const_iter batched_bias_Wh_local_end = batched_bias_Wh_.end();
+  span_T_const_iter batched_bias_Rh_local_end = batched_bias_Rh_.end();
+  span_T_const_iter batched_bias_WRh_local_end = batched_bias_WRh_.end();
 
   size_t out_added_offset;
 
-  span_T_const_iter prev_Ht = batched_hidden0_.cbegin();  // Ht-1
-  span_T_const_iter prev_Ht_end = batched_hidden0_.cend();
+  span_T_const_iter prev_Ht = batched_hidden0_.begin();  // Ht-1
+  span_T_const_iter prev_Ht_end = batched_hidden0_.end();
   span_T_iter cur_h_local = cur_h_.begin();
   span_T_iter cur_h_local_end = cur_h_.end();
 
@@ -582,14 +582,14 @@ void UniDirectionalGru<T>::Compute(const gsl::span<const T>& inputs_arg,
   span_T_const_iter batched_bias_Rh_local{};
 
   if (use_bias_) {
-    batched_bias_WRz_local = batched_bias_WRz_.cbegin();
-    batched_bias_WRr_local = batched_bias_WRr_.cbegin();
+    batched_bias_WRz_local = batched_bias_WRz_.begin();
+    batched_bias_WRr_local = batched_bias_WRr_.begin();
 
     if (linear_before_reset_) {
-      batched_bias_Wh_local = batched_bias_Wh_.cbegin();
-      batched_bias_Rh_local = batched_bias_Rh_.cbegin();
+      batched_bias_Wh_local = batched_bias_Wh_.begin();
+      batched_bias_Rh_local = batched_bias_Rh_.begin();
     } else {
-      batched_bias_WRh_local = batched_bias_WRh_.cbegin();
+      batched_bias_WRh_local = batched_bias_WRh_.begin();
     }
   }
 
@@ -614,7 +614,7 @@ void UniDirectionalGru<T>::Compute(const gsl::span<const T>& inputs_arg,
       ComputeGemm(batch_size_, hidden_size_x2, hidden_size_, alpha,
                   prev_Ht, prev_Ht_end,
                   hidden_size_,
-                  recurrent_weightsZR.cbegin(), recurrent_weightsZR.cend(),
+                  recurrent_weightsZR.begin(), recurrent_weightsZR.end(),
                   hidden_size_, 1.f,  // beta == 1 so we add existing values in outputZRH_
                   outputZRH_.begin() + out_added_offset, outputZRH_.end(),
                   hidden_size_x3, ttp_);
@@ -634,7 +634,7 @@ void UniDirectionalGru<T>::Compute(const gsl::span<const T>& inputs_arg,
         ComputeGemm(batch_size_, hidden_size_, hidden_size_, alpha,
                     prev_Ht, prev_Ht_end,  // Ht-1
                     hidden_size_,
-                    recurrent_weightsH.cbegin(), recurrent_weightsH.cend(),  // Rh^T
+                    recurrent_weightsH.begin(), recurrent_weightsH.end(),  // Rh^T
                     hidden_size_,
                     use_bias_ ? 1.f : 0.f,  // don't add values in linear_output_ if no bias input
                     linear_output_.begin(),
@@ -707,7 +707,7 @@ void UniDirectionalGru<T>::Compute(const gsl::span<const T>& inputs_arg,
         ComputeGemm(batch_size_, hidden_size_, hidden_size_, alpha,
                     cur_h_local, cur_h_local_end,  // rt (.) Ht-1
                     hidden_size_,
-                    recurrent_weightsH.cbegin(), recurrent_weightsH.cend(),  // Rh^T
+                    recurrent_weightsH.begin(), recurrent_weightsH.end(),  // Rh^T
                     hidden_size_, 1.f,                                       // beta == 1 to add Xt*(Wh^T) from out_H
                     out_H, outputZRH_.end(),
                     hidden_size_x3, ttp_);
diff --git a/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.h b/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.h
index 885675f3b7..8d805f2a17 100644
--- a/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.h
+++ b/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.h
@@ -5,6 +5,7 @@
 
 #include <limits>
 
+#include "core/common/narrow.h"
 #include "core/framework/op_kernel.h"
 #include "core/providers/cpu/rnn/rnn_helpers.h"
 
@@ -21,10 +22,10 @@ class DeepCpuGruOp final : public OpKernel {
 
     int64_t int64_value;
     ORT_ENFORCE(info.GetAttr("linear_before_reset", &int64_value).IsOK());
-    linear_before_reset_ = gsl::narrow<int>(int64_value);
+    linear_before_reset_ = narrow<int>(int64_value);
 
     ORT_ENFORCE(info.GetAttr("hidden_size", &int64_value).IsOK() && int64_value > 0);
-    hidden_size_ = gsl::narrow<int>(int64_value);
+    hidden_size_ = narrow<int>(int64_value);
 
     // optional attributes
     std::vector<std::string> activation_func_names = info.GetAttrsOrDefault<std::string>("activations");
diff --git a/onnxruntime/core/providers/cpu/rnn/lstm_base.cc b/onnxruntime/core/providers/cpu/rnn/lstm_base.cc
index cfabe3fb45..777a461624 100644
--- a/onnxruntime/core/providers/cpu/rnn/lstm_base.cc
+++ b/onnxruntime/core/providers/cpu/rnn/lstm_base.cc
@@ -3,6 +3,7 @@
 
 #include "lstm_base.h"
 #include "uni_directional_lstm.h"
+#include "core/common/narrow.h"
 //TODO: fix the warnings
 #if defined(_MSC_VER) && !defined(__clang__)
 #pragma warning(disable : 26451)
@@ -39,9 +40,9 @@ Status LSTMBase::ComputeImpl(OpKernelContext& context,
 
   const auto& X_shape = X.Shape();
 
-  int seq_length = gsl::narrow<int>(X_shape[0]);
-  int batch_size = gsl::narrow<int>(X_shape[1]);
-  int input_size = gsl::narrow<int>(X_shape[2]);
+  int seq_length = narrow<int>(X_shape[0]);
+  int batch_size = narrow<int>(X_shape[1]);
+  int input_size = narrow<int>(X_shape[2]);
 
   Status status = ValidateInputs(X, B, sequence_lens, initial_h, initial_c, P);
   ORT_RETURN_IF_ERROR(status);
@@ -208,8 +209,8 @@ Status LSTMBase::ValidateInputs(const Tensor& X,
     }
 
     auto sequence_len_entries = sequence_lens->DataAsSpan<int>();
-    if (std::any_of(sequence_len_entries.cbegin(),
-                    sequence_len_entries.cend(),
+    if (std::any_of(sequence_len_entries.begin(),
+                    sequence_len_entries.end(),
                     [seq_length](int len) { return len < 0 || len > seq_length; })) {
       return ORT_MAKE_STATUS(
           ONNXRUNTIME, INVALID_ARGUMENT,
diff --git a/onnxruntime/core/providers/cpu/rnn/lstm_base.h b/onnxruntime/core/providers/cpu/rnn/lstm_base.h
index 2d9061fb1e..2ae13771cd 100644
--- a/onnxruntime/core/providers/cpu/rnn/lstm_base.h
+++ b/onnxruntime/core/providers/cpu/rnn/lstm_base.h
@@ -3,6 +3,7 @@
 
 #pragma once
 
+#include "core/common/narrow.h"
 #include "core/framework/op_kernel.h"
 #include "core/providers/cpu/rnn/rnn_helpers.h"
 
@@ -20,7 +21,7 @@ class LSTMBase {
 
     int64_t int64_value;
     ORT_ENFORCE(info.GetAttr("hidden_size", &int64_value).IsOK() && int64_value > 0);
-    hidden_size_ = gsl::narrow<int>(int64_value);
+    hidden_size_ = narrow<int>(int64_value);
 
     // optional attributes
     std::vector<std::string> activation_func_names = info.GetAttrsOrDefault<std::string>("activations");
diff --git a/onnxruntime/core/providers/cpu/rnn/rnn_helpers.cc b/onnxruntime/core/providers/cpu/rnn/rnn_helpers.cc
index ece449095c..0f74c16203 100644
--- a/onnxruntime/core/providers/cpu/rnn/rnn_helpers.cc
+++ b/onnxruntime/core/providers/cpu/rnn/rnn_helpers.cc
@@ -79,8 +79,8 @@ Status ValidateCommonRnnInputs(const Tensor& X,
     }
 
     auto sequence_len_entries = sequence_lens->DataAsSpan<int>();
-    if (std::any_of(sequence_len_entries.cbegin(),
-                    sequence_len_entries.cend(),
+    if (std::any_of(sequence_len_entries.begin(),
+                    sequence_len_entries.end(),
                     [seq_length](int len) { return len < 0 || len > seq_length; })) {
       return ORT_MAKE_STATUS(
           ONNXRUNTIME, INVALID_ARGUMENT,
diff --git a/onnxruntime/core/providers/cpu/rnn/rnn_helpers.h b/onnxruntime/core/providers/cpu/rnn/rnn_helpers.h
index 95a8d87cc7..076e20430e 100644
--- a/onnxruntime/core/providers/cpu/rnn/rnn_helpers.h
+++ b/onnxruntime/core/providers/cpu/rnn/rnn_helpers.h
@@ -18,7 +18,7 @@
 #include "core/common/safeint.h"
 #include "core/platform/threadpool.h"
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 namespace onnxruntime {
 namespace rnn {
@@ -247,11 +247,39 @@ void ComputeGemm(const int M,
                  int32_t* quantize_agg_C_buffer,
                  concurrency::ThreadPool* thread_pool);
 
+// span convenience overload: passes A_span/C_span as begin/end pointer pairs to the pointer-based ComputeGemm above
+template <typename GemmWeightsType>
+inline void ComputeGemm(const int M,
+                        const int N,
+                        const int K,
+                        const float alpha,
+                        gsl::span<const float> A_span,
+                        const GemmWeights<GemmWeightsType>& weights,
+                        const float beta,
+                        gsl::span<float> C_span,
+                        const int ldc,
+                        uint8_t* quantized_A_buffer,
+                        int32_t* quantize_agg_C_buffer,
+                        concurrency::ThreadPool* thread_pool) {
+  ComputeGemm(M,
+              N,
+              K,
+              alpha,
+              A_span.data(), A_span.data() + A_span.size(),  // begin and one-past-the-end of A_span
+              weights,
+              beta,
+              C_span.data(), C_span.data() + C_span.size(),  // begin and one-past-the-end of C_span
+              ldc,
+              quantized_A_buffer,     // forwarded unchanged
+              quantize_agg_C_buffer,  // forwarded unchanged
+              thread_pool);
+}
+
 // helper to convert a span to a raw pointer
 // after validating the memory covered by the span supports the size required
 template <typename T>
-const T* SafeRawConstPointer(typename gsl::span<T>::const_iterator cur,
-                             typename gsl::span<T>::const_iterator end,
+const T* SafeRawConstPointer(typename gsl::span<const T>::iterator cur,
+                             typename gsl::span<const T>::iterator end,
                              size_t size) {
   ORT_ENFORCE(cur + size <= end);
   return &*cur;
@@ -260,7 +288,7 @@ const T* SafeRawConstPointer(typename gsl::span<T>::const_iterator cur,
 // helper to convert a span to a raw pointer
 // after validating the memory covered by the span supports the size required
 template <typename T>
-const T* SafeRawConstPointer(gsl::span<T> span, size_t offset, size_t size) {
+const T* SafeRawConstPointer(gsl::span<const T> span, size_t offset, size_t size) {  // NOTE(review): returns span.data(); offset is only bounds-checked, not added - confirm intended
   ORT_ENFORCE(offset + size <= size_t(span.size()));
   return span.data();
 }
diff --git a/onnxruntime/core/providers/cpu/rnn/uni_directional_lstm.cc b/onnxruntime/core/providers/cpu/rnn/uni_directional_lstm.cc
index 605ebacaee..5316b825c7 100644
--- a/onnxruntime/core/providers/cpu/rnn/uni_directional_lstm.cc
+++ b/onnxruntime/core/providers/cpu/rnn/uni_directional_lstm.cc
@@ -234,7 +234,7 @@ void UniDirectionalLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
   }
 
   // LSTM Layer
-  gsl::span<T> batched_hidden_state_one_step = batched_hidden0_;
+  gsl::span<const T> batched_hidden_state_one_step = batched_hidden0_;
   gsl::span<T> batched_internal_state_prev_one_step = batched_internal_memory_prev_;
   gsl::span<T> batched_internal_state_clipped_one_step = batched_internal_memory_clipped_;
 
@@ -264,7 +264,7 @@ void UniDirectionalLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
   // DumpMatrix("Input", inputs.data(), seq_length_, batch_size_ * input_size_);
 
   // Calculate the max and min length
-  const auto min_max_pair = std::minmax_element(sequence_lengths.cbegin(), sequence_lengths.cend());
+  const auto min_max_pair = std::minmax_element(sequence_lengths.begin(), sequence_lengths.end());
   int max_sequence_length = *min_max_pair.second;
   int min_sequence_length = std::min(seq_length_, *min_max_pair.first);
 
@@ -278,10 +278,10 @@ void UniDirectionalLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
   AllocateQuantizeBuffers<WeightT>(max_sequence_length);
 
   // apply the weights to all the inputs and save to output_IOFC
-  ComputeGemm(total_rows, hidden_size_x4, input_size_, alpha, inputs.cbegin(), inputs.cend(),
+  ComputeGemm(total_rows, hidden_size_x4, input_size_, alpha, inputs,
               input_weights,
-              beta, output_iofc_.begin(), output_iofc_.end(), hidden_size_x4,
-              quantized_input_or_a_.begin(),
+              beta, output_iofc_, hidden_size_x4,
+              quantized_input_or_a_.data(),
               nullptr,
               thread_pool_);
 
@@ -305,7 +305,7 @@ void UniDirectionalLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
 
   // lambda to do all processing on num_seq_to_compute sequences
   auto sequences_calculator = [&](int seq_start, onnxruntime::concurrency::ThreadPool* ttp) {
-    span_T_const_iter previous_state_end = batched_hidden_state_one_step.cend();
+    auto previous_state_end = batched_hidden_state_one_step.end();
 
     // handling boundaries
     int num_seq_to_compute_adjusted = num_seq_to_compute;
@@ -318,7 +318,7 @@ void UniDirectionalLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
 
     // hidden state can be provided as input for first step, so need to special case that.
     // after the first step this will switch to the output from the previous step
-    span_T_const_iter previous_state = batched_hidden_state_one_step.cbegin() + seq_start * hidden_size_;
+    auto previous_state = batched_hidden_state_one_step.begin() + seq_start * hidden_size_;
 
     // run through steps sequentially
     for (int step = 0; step < max_sequence_length; step++) {
@@ -331,12 +331,12 @@ void UniDirectionalLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
       // calculate Xt*(W[iofc]^T) + Ht-t*R[iofc]
       // Do it sequentially to avoid nested parallelism
       ComputeGemm(num_seq_to_compute_adjusted, hidden_size_x4, hidden_size_, alpha,
-                  previous_state, previous_state_end,       // Ht-1
-                  recurrent_weights,                        // R[iofc]
-                  beta, step_out_IOFC, output_iofc_.end(),  // input contains Xt*(W[iofc]^T)
+                  gsl::span<const T>(&*previous_state, previous_state_end - previous_state),  // Ht-1
+                  recurrent_weights,                                                          // R[iofc]
+                  beta, gsl::span<T>(&*step_out_IOFC, output_iofc_.end() - step_out_IOFC),    // input contains Xt*(W[iofc]^T)
                   hidden_size_x4,
-                  quantized_input_or_a_.begin() + (seq_start * hidden_size_),
-                  quantized_C_buffer_.begin() + (seq_start * hidden_size_x4),
+                  quantized_input_or_a_.data() + (seq_start * hidden_size_),
+                  quantized_C_buffer_.data() + (seq_start * hidden_size_x4),
                   ttp);
 
       DumpMatrix("Xt*(W[iofc]^T) + Ht-t*R[iofc]" + row_str, &*step_out_IOFC, num_seq_to_compute_adjusted, hidden_size_x4);
diff --git a/onnxruntime/core/providers/cpu/rnn/uni_directional_lstm.h b/onnxruntime/core/providers/cpu/rnn/uni_directional_lstm.h
index 9eb71112d2..1609449030 100644
--- a/onnxruntime/core/providers/cpu/rnn/uni_directional_lstm.h
+++ b/onnxruntime/core/providers/cpu/rnn/uni_directional_lstm.h
@@ -40,7 +40,6 @@ class UniDirectionalLstm {
   ~UniDirectionalLstm() = default;
 
  private:
-  using span_T_const_iter = typename gsl::span<T>::const_iterator;
   using span_T_iter = typename gsl::span<T>::iterator;
 
   void SetNumThreads();
diff --git a/onnxruntime/core/providers/cpu/sequence/sequence_ops.cc b/onnxruntime/core/providers/cpu/sequence/sequence_ops.cc
index d3fdcaf351..32e47dcd3a 100644
--- a/onnxruntime/core/providers/cpu/sequence/sequence_ops.cc
+++ b/onnxruntime/core/providers/cpu/sequence/sequence_ops.cc
@@ -2,6 +2,8 @@
 // Licensed under the MIT License.
 
 #include "core/providers/cpu/sequence/sequence_ops.h"
+
+#include "core/common/narrow.h"
 #include "core/framework/tensorprotoutils.h"
 #include "core/framework/TensorSeq.h"
 #include "core/framework/op_kernel_type_control_utils.h"
@@ -381,11 +383,11 @@ Status SplitToSequence::PrepareForCompute(const TensorShape& input_shape, int64_
   axis = HandleNegativeAxis(axis_, num_dimensions);  // handle negative and enforce axis is valid
   const int64_t split_dim_size = input_dims[axis];
 
-  before_dims = gsl::narrow<int>(input_shape.SizeToDimension(axis));
-  after_dims_including_split_axis = gsl::narrow<int>(input_shape.SizeFromDimension(axis));
+  before_dims = narrow<int>(input_shape.SizeToDimension(axis));
+  after_dims_including_split_axis = narrow<int>(input_shape.SizeFromDimension(axis));
   after_dims_excluding_split = (axis + 1 == num_dimensions)
                                    ? 1  // we multiply by this value so must be 1 not 0
-                                   : gsl::narrow<int>(input_shape.SizeFromDimension(axis + 1));
+                                   : narrow<int>(input_shape.SizeFromDimension(axis + 1));
 
   if (is_split_input_scalar) {
     auto num_even_splits = split_dim_size / split_scalar;
@@ -513,7 +515,7 @@ Status SplitToSequence::ComputeImpl(OpKernelContext& context, const Tensor& inpu
     if (is_uneven_split && i == num_outputs - 1) {  // only for the last output that has a size different from the rest
       split_size = num_remaining_splits;
     } else {
-      split_size = gsl::narrow<int>(split_sizes[i]);
+      split_size = narrow<int>(split_sizes[i]);
     }
     output_dimensions[axis] = split_size;
 
diff --git a/onnxruntime/core/providers/cpu/tensor/cast_op.cc b/onnxruntime/core/providers/cpu/tensor/cast_op.cc
index 4c23a15a6c..7469f06262 100644
--- a/onnxruntime/core/providers/cpu/tensor/cast_op.cc
+++ b/onnxruntime/core/providers/cpu/tensor/cast_op.cc
@@ -7,9 +7,10 @@
 
 #include "boost/mp11.hpp"
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 #include "core/common/common.h"
+#include "core/common/narrow.h"
 #include "core/common/type_list.h"
 #include "core/framework/data_types_internal.h"
 #include "core/framework/data_types.h"
@@ -174,7 +175,7 @@ struct TensorCaster {
     using SrcEigenCastType = typename EigenCastType<SrcType>::type;
     using DstEigenCastType = typename EigenCastType<DstType>::type;
 
-    const std::ptrdiff_t shape_size = gsl::narrow<std::ptrdiff_t>(shape.Size());
+    const std::ptrdiff_t shape_size = narrow<std::ptrdiff_t>(shape.Size());
     const auto in_vector =
         ConstEigenVectorMap<SrcEigenCastType>(reinterpret_cast<const SrcEigenCastType*>(in.Data<SrcType>()), shape_size);
     auto out_vector =
@@ -187,7 +188,7 @@ struct TensorCaster {
 template <typename SrcType>
 struct TensorCaster<SrcType, std::string> {
   void Cast(const OpKernelContext&, const TensorShape& shape, const Tensor& in, Tensor& out) const {
-    const std::ptrdiff_t shape_size = gsl::narrow<std::ptrdiff_t>(shape.Size());
+    const std::ptrdiff_t shape_size = narrow<std::ptrdiff_t>(shape.Size());
     const auto* in_data = in.Data<SrcType>();
     auto* out_data = out.MutableData<std::string>();
     for (std::ptrdiff_t i = 0; i < shape_size; ++i) {
@@ -200,7 +201,7 @@ struct TensorCaster<SrcType, std::string> {
 template <typename DstType>
 struct TensorCaster<std::string, DstType> {
   void Cast(const OpKernelContext&, const TensorShape& shape, const Tensor& in, Tensor& out) const {
-    const std::ptrdiff_t shape_size = gsl::narrow<std::ptrdiff_t>(shape.Size());
+    const std::ptrdiff_t shape_size = narrow<std::ptrdiff_t>(shape.Size());
     const auto* in_data = in.Data<std::string>();
     auto* out_data = out.MutableData<DstType>();
     for (std::ptrdiff_t i = 0; i < shape_size; ++i) {
@@ -219,7 +220,7 @@ struct TensorCaster<MLFloat16, float> {
   void Cast(const OpKernelContext&, const TensorShape& shape, const Tensor& in, Tensor& out) const {
     auto out_data = out.MutableData<float>();
     auto in_data = in.Data<MLFloat16>();
-    const size_t shape_size = gsl::narrow<size_t>(shape.Size());
+    const size_t shape_size = narrow<size_t>(shape.Size());
     MlasConvertHalfToFloatBuffer(&in_data[0].val, out_data, shape_size);
   }
 };
diff --git a/onnxruntime/core/providers/cpu/tensor/gather.cc b/onnxruntime/core/providers/cpu/tensor/gather.cc
index ad444d3d8f..9991f428c3 100644
--- a/onnxruntime/core/providers/cpu/tensor/gather.cc
+++ b/onnxruntime/core/providers/cpu/tensor/gather.cc
@@ -2,9 +2,10 @@
 // Licensed under the MIT License.
 
 //https://github.com/onnx/onnx/blob/main/docs/Operators.md#Gather
-#include <core/common/safeint.h>
 #include "core/providers/cpu/tensor/gather.h"
 #include "core/common/common.h"
+#include "core/common/narrow.h"
+#include "core/common/safeint.h"
 #include "core/framework/op_kernel_type_control_utils.h"
 #include "core/platform/threadpool.h"
 #include "core/providers/op_kernel_type_control.h"
@@ -62,20 +63,20 @@ Status GatherBase::PrepareForCompute(OpKernelContext* context, Prepare& p) const
   const TensorShape& indices_shape = p.indices_tensor->Shape();
 
   const auto input_rank = input_data_shape.NumDimensions();
-  p.axis = HandleNegativeAxis(axis_, gsl::narrow<int64_t>(input_rank));
+  p.axis = HandleNegativeAxis(axis_, narrow<int64_t>(input_rank));
 
   std::vector<int64_t> shape;
   shape.reserve(input_rank - 1 + indices_shape.NumDimensions());
 
   // replace the dimension for p.axis with the shape from the indices
   for (int64_t i = 0; i < p.axis; ++i)
-    shape.push_back(input_data_shape[gsl::narrow<size_t>(i)]);
+    shape.push_back(input_data_shape[narrow<size_t>(i)]);
 
   for (const auto dim : indices_shape.GetDims())
     shape.push_back(dim);
 
   for (int64_t i = p.axis + 1; i < static_cast<int64_t>(input_rank); ++i)
-    shape.push_back(input_data_shape[gsl::narrow<size_t>(i)]);
+    shape.push_back(input_data_shape[narrow<size_t>(i)]);
 
   p.output_tensor = context->Output(0, TensorShape(std::move(shape)));
 
@@ -90,7 +91,7 @@ Status GatherCopyData(const Tensor* indices_tensor, const uint8_t* src_base, uin
   const Tin* indices_data = indices_tensor->Data<Tin>();
 
   // Check the indices first in case there's a out of bound index.
-  auto axis_dim_limit = input_data_shape[gsl::narrow<size_t>(axis)];
+  auto axis_dim_limit = input_data_shape[narrow<size_t>(axis)];
 
   for (int64_t i = 0; i < N; ++i) {
     Tin idx = indices_data[i];
@@ -116,7 +117,7 @@ Status GatherCopyData(const Tensor* indices_tensor, const uint8_t* src_base, uin
       reinterpret_cast<std::string*>(dst_base)[dst_offset / element_bytes] =
           reinterpret_cast<const std::string*>(src_base)[src_offset / element_bytes];
     } else {
-      memcpy(dst_base + dst_offset, src_base + src_offset, gsl::narrow<size_t>(block_size));
+      memcpy(dst_base + dst_offset, src_base + src_offset, narrow<size_t>(block_size));
     }
   };
   concurrency::ThreadPool::TryParallelFor(tp, SafeInt<ptrdiff_t>(M) * N, static_cast<double>(block_size),
@@ -140,9 +141,9 @@ Status Gather::Compute(OpKernelContext* context) const {
   const size_t element_bytes = p.input_tensor->DataType()->Size();
   const int64_t block = input_data_shape.SizeFromDimension(SafeInt<size_t>(p.axis) + 1);
   const int64_t block_size = SafeInt<int64_t>(element_bytes) * block ;
-  const int64_t M = input_data_shape.SizeToDimension(gsl::narrow<size_t>(p.axis));
+  const int64_t M = input_data_shape.SizeToDimension(narrow<size_t>(p.axis));
   const int64_t N = p.indices_tensor->Shape().Size();
-  const int64_t data_batch_bytes = input_data_shape.SizeFromDimension(gsl::narrow<size_t>(p.axis)) * element_bytes;
+  const int64_t data_batch_bytes = input_data_shape.SizeFromDimension(narrow<size_t>(p.axis)) * element_bytes;
   const int64_t gathered_batch_bytes = N * block * SafeInt<int64_t>(element_bytes);
 
   const auto* src_base = static_cast<const uint8_t*>(p.input_tensor->DataRaw());
diff --git a/onnxruntime/core/providers/cpu/tensor/mean_variance_normalization.h b/onnxruntime/core/providers/cpu/tensor/mean_variance_normalization.h
index 0259718766..a84941e3e3 100644
--- a/onnxruntime/core/providers/cpu/tensor/mean_variance_normalization.h
+++ b/onnxruntime/core/providers/cpu/tensor/mean_variance_normalization.h
@@ -7,7 +7,7 @@
 #include "core/framework/op_kernel.h"
 #include "core/util/math_cpuonly.h"
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 namespace onnxruntime {
 template <typename T>
 class MeanVarianceNormalization_0 : public OpKernel {
diff --git a/onnxruntime/core/providers/cpu/tensor/padbase.h b/onnxruntime/core/providers/cpu/tensor/padbase.h
index 3ae68bba18..a3a0d7adc6 100644
--- a/onnxruntime/core/providers/cpu/tensor/padbase.h
+++ b/onnxruntime/core/providers/cpu/tensor/padbase.h
@@ -50,7 +50,7 @@ class PadBase {
       gsl::span<const int64_t> pads_span;
       if (!info.GetAttrsAsSpan("pads", pads_span).IsOK())
         ORT_THROW("Invalid 'pads' attribute value");
-      pads_.assign(pads_span.cbegin(), pads_span.cend());
+      pads_.assign(pads_span.begin(), pads_span.end());
       // Separate out any negative pads_ into the slices_ array
       slices_.resize(pads_.size(), 0);
       for (size_t index = 0; index < pads_.size(); index++) {
diff --git a/onnxruntime/core/providers/cpu/tensor/reshape.h b/onnxruntime/core/providers/cpu/tensor/reshape.h
index 140aae772d..d59d4558ae 100644
--- a/onnxruntime/core/providers/cpu/tensor/reshape.h
+++ b/onnxruntime/core/providers/cpu/tensor/reshape.h
@@ -6,7 +6,6 @@
 #include "core/common/common.h"
 #include "core/framework/op_kernel.h"
 #include "core/framework/tensor.h"
-#include "gsl/gsl"
 #include "reshape_helper.h"
 #include "utils.h"
 
diff --git a/onnxruntime/core/providers/cpu/tensor/reverse_sequence.cc b/onnxruntime/core/providers/cpu/tensor/reverse_sequence.cc
index 81aa292790..fa0bd02ee5 100644
--- a/onnxruntime/core/providers/cpu/tensor/reverse_sequence.cc
+++ b/onnxruntime/core/providers/cpu/tensor/reverse_sequence.cc
@@ -11,7 +11,7 @@
 #pragma warning(disable : 4996)
 #endif
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 #ifdef _MSC_VER
 #pragma warning(pop)
diff --git a/onnxruntime/core/providers/cpu/tensor/scatter.cc b/onnxruntime/core/providers/cpu/tensor/scatter.cc
index e113aae4d8..1701b2beed 100644
--- a/onnxruntime/core/providers/cpu/tensor/scatter.cc
+++ b/onnxruntime/core/providers/cpu/tensor/scatter.cc
@@ -5,9 +5,8 @@
 #include <type_traits>
 #include <core/common/safeint.h>
 
-#include "gsl/gsl"
-
 #include "core/common/common.h"
+#include "core/common/narrow.h"
 #include "core/framework/element_type_lists.h"
 #include "core/framework/op_kernel.h"
 #include "core/framework/op_kernel_type_control_utils.h"
@@ -174,10 +173,10 @@ Status GetIndices(
   const auto& input_data_shape = data_input.Shape();
   const auto* indices_data_raw = indices_input.Data<TIndex>();
   const auto num_indices = indices_input.Shape().Size();
-  const auto axis_dim_limit = input_data_shape[gsl::narrow<size_t>(axis)];
+  const auto axis_dim_limit = input_data_shape[narrow<size_t>(axis)];
 
   std::vector<int64_t> indices_data_result;
-  indices_data_result.reserve(gsl::narrow<size_t>(num_indices));
+  indices_data_result.reserve(narrow<size_t>(num_indices));
 
   for (int64_t i = 0; i < num_indices; ++i) {
     const int64_t idx = static_cast<int64_t>(indices_data_raw[i]);
@@ -206,7 +205,7 @@ Status ScatterData(
   const auto input_elements = input_data_shape.Size();
   const auto total_input_bytes = data_input->SizeInBytes();
 
-  const auto num_indices = gsl::narrow<int64_t>(indices_data.size());
+  const auto num_indices = narrow<int64_t>(indices_data.size());
 
   const auto* src_base = static_cast<const Tdata*>(data_input->DataRaw());
   auto* dst_base = static_cast<Tdata*>(data_output->MutableDataRaw());
@@ -265,14 +264,14 @@ Status ScatterData(
     // We start at num_dims - 2 because we already pre-populated
     // the last element above
     for (auto i = int64_t(num_dims - 2); i >= 0; --i) {
-      dim_block_size[gsl::narrow<size_t>(i)] = input_data_shape[SafeInt<size_t>(i) + 1] * dim_block_size[SafeInt<size_t>(i) + 1];
+      dim_block_size[narrow<size_t>(i)] = input_data_shape[SafeInt<size_t>(i) + 1] * dim_block_size[SafeInt<size_t>(i) + 1];
     }
   }
 
   const auto* update_data = static_cast<const Tdata*>(updates_input->DataRaw());
   // For every update we compute the destination offset and copy it there
   for (int64_t index = 0; index < num_indices;) {
-    const auto axis_idx = indices_data[gsl::narrow<size_t>(index)];
+    const auto axis_idx = indices_data[narrow<size_t>(index)];
 
     // Compute the offset
     // See comments above for dim_block_size
@@ -280,9 +279,9 @@ Status ScatterData(
     for (size_t i = 0; i < num_dims; ++i) {
       if (i == size_t(axis)) {
         // replace the counter with the update index for this dim
-        dst_offset += gsl::narrow<size_t>(axis_idx * dim_block_size[gsl::narrow<size_t>(i)]);
+        dst_offset += narrow<size_t>(axis_idx * dim_block_size[narrow<size_t>(i)]);
       } else {
-        dst_offset += gsl::narrow<size_t>(dim_counters[gsl::narrow<size_t>(i)] * dim_block_size[gsl::narrow<size_t>(i)]);
+        dst_offset += narrow<size_t>(dim_counters[narrow<size_t>(i)] * dim_block_size[narrow<size_t>(i)]);
       }
     }
 
@@ -294,15 +293,15 @@ Status ScatterData(
     // Increment counters
     // See comments for dim_counters above
     for (auto i = int64_t(num_dims - 1); i >= 0; --i) {
-      auto v = ++dim_counters[gsl::narrow<size_t>(i)];
-      assert(v <= upd_shape[gsl::narrow<size_t>(i)]);
-      if (v < upd_shape[gsl::narrow<size_t>(i)]) {
+      auto v = ++dim_counters[narrow<size_t>(i)];
+      assert(v <= upd_shape[narrow<size_t>(i)]);
+      if (v < upd_shape[narrow<size_t>(i)]) {
         // No carry, done
         break;
       }
       // No carry for the most significant dim
       assert(i > 0);
-      dim_counters[gsl::narrow<size_t>(i)] = 0;
+      dim_counters[narrow<size_t>(i)] = 0;
     }
   }
   return Status::OK();
diff --git a/onnxruntime/core/providers/cpu/tensor/shape_op.h b/onnxruntime/core/providers/cpu/tensor/shape_op.h
index ac0b9f8f6c..877a83df55 100644
--- a/onnxruntime/core/providers/cpu/tensor/shape_op.h
+++ b/onnxruntime/core/providers/cpu/tensor/shape_op.h
@@ -8,7 +8,7 @@
 #include "core/framework/op_kernel.h"
 #endif
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include <limits>
 
 namespace onnxruntime {
diff --git a/onnxruntime/core/providers/cpu/tensor/slice.cc b/onnxruntime/core/providers/cpu/tensor/slice.cc
index 4b8dc31740..602762c0fd 100644
--- a/onnxruntime/core/providers/cpu/tensor/slice.cc
+++ b/onnxruntime/core/providers/cpu/tensor/slice.cc
@@ -6,6 +6,7 @@
 #include <limits>
 #include <unordered_map>
 
+#include "core/common/narrow.h"
 #include "core/framework/element_type_lists.h"
 #include "core/framework/op_kernel_type_control_utils.h"
 #include "core/providers/common.h"
@@ -83,7 +84,7 @@ static void FlattenOutputDims(gsl::span<const int64_t> input_dimensions,
   int num_to_combine = 0;
   for (int64_t i = static_cast<int64_t>(starts.size()) - 1; i >= 0; --i) {
     // if we're keeping all the data for the dimension and not reversing the direction we can potentially combine it
-    if (steps[gsl::narrow<size_t>(i)] == 1 && input_dimensions[gsl::narrow<size_t>(i)] == output_dims[gsl::narrow<size_t>(i)])
+    if (steps[narrow<size_t>(i)] == 1 && input_dimensions[narrow<size_t>(i)] == output_dims[narrow<size_t>(i)])
       ++num_to_combine;
     else
       break;
@@ -91,7 +92,7 @@ static void FlattenOutputDims(gsl::span<const int64_t> input_dimensions,
 
   if (num_to_combine > 1) {
     auto num_dims = output_dims.size() - num_to_combine + 1;
-    flattened_output_dims->assign(output_dims.cbegin(), output_dims.cend());
+    flattened_output_dims->assign(output_dims.begin(), output_dims.end());
     flattened_output_dims->resize(num_dims);
 
     int64_t dim_value = 1;
@@ -149,17 +150,17 @@ void CopyData(const Tensor& start_tensor,
               TensorShapeVector& input_axes,
               TensorShapeVector& input_steps) {
   auto start_data = start_tensor.DataAsSpan<T>();
-  std::copy(start_data.cbegin(), start_data.cend(), std::back_inserter(input_starts));
+  std::copy(start_data.begin(), start_data.end(), std::back_inserter(input_starts));
   auto ends_data = ends_tensor.DataAsSpan<T>();
-  std::copy(ends_data.cbegin(), ends_data.cend(), std::back_inserter(input_ends));
+  std::copy(ends_data.begin(), ends_data.end(), std::back_inserter(input_ends));
   if (nullptr != axes_tensor) {
     auto axes_data = axes_tensor->DataAsSpan<T>();
-    std::copy(axes_data.cbegin(), axes_data.cend(), std::back_inserter(input_axes));
+    std::copy(axes_data.begin(), axes_data.end(), std::back_inserter(input_axes));
   }
   // Slice V10
   if (nullptr != steps_tensor) {
     auto steps_data = steps_tensor->DataAsSpan<T>();
-    std::copy(steps_data.cbegin(), steps_data.cend(), std::back_inserter(input_steps));
+    std::copy(steps_data.begin(), steps_data.end(), std::back_inserter(input_steps));
   }
 }
 }  // namespace
@@ -182,13 +183,13 @@ Status SliceBase::FillVectorsFromInput(const Tensor& start_tensor,
                     "Starts and steps shape mismatch");
 
   const auto size = start_tensor.Shape().Size();
-  input_starts.reserve(gsl::narrow<size_t>(size));
-  input_ends.reserve(gsl::narrow<size_t>(size));
+  input_starts.reserve(narrow<size_t>(size));
+  input_ends.reserve(narrow<size_t>(size));
   if (nullptr != axes_tensor)
-    input_axes.reserve(gsl::narrow<size_t>(size));
+    input_axes.reserve(narrow<size_t>(size));
   // Slice V10
   if (nullptr != steps_tensor)
-    input_steps.reserve(gsl::narrow<size_t>(size));
+    input_steps.reserve(narrow<size_t>(size));
 
   // check for type reduction of supported indices types
   constexpr bool int32_enabled = utils::HasType<EnabledIndicesTypes, int32_t>();
diff --git a/onnxruntime/core/providers/cpu/tensor/slice_compute_metadata.h b/onnxruntime/core/providers/cpu/tensor/slice_compute_metadata.h
index 5f80bd1033..3b2908a1c9 100644
--- a/onnxruntime/core/providers/cpu/tensor/slice_compute_metadata.h
+++ b/onnxruntime/core/providers/cpu/tensor/slice_compute_metadata.h
@@ -6,7 +6,7 @@
 
 #include <cstdint>
 #include <vector>
-#include <gsl/gsl>
+#include "core/common/gsl.h"
 #include "core/framework/tensor_shape.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/core/providers/cpu/tensor/slice_helper.h b/onnxruntime/core/providers/cpu/tensor/slice_helper.h
index 22d8906af2..0f54e12795 100644
--- a/onnxruntime/core/providers/cpu/tensor/slice_helper.h
+++ b/onnxruntime/core/providers/cpu/tensor/slice_helper.h
@@ -27,7 +27,7 @@ inline Status PrepareForComputeHelper(const gsl::span<const int64_t>& raw_starts
     }
   } else {
     axes.reserve(raw_axes.size());
-    axes.assign(raw_axes.cbegin(), raw_axes.cend());
+    axes.assign(raw_axes.begin(), raw_axes.end());
   }
 
   // Iterate through the provided axes and override the start/end ranges
@@ -86,7 +86,7 @@ inline Status PrepareForComputeHelper(const gsl::span<const int64_t>& raw_starts
       axes.push_back(i);
     }
   } else {
-    axes.assign(raw_axes.cbegin(), raw_axes.cend());
+    axes.assign(raw_axes.begin(), raw_axes.end());
   }
 
   // Iterate through the provided axes and override the start/end/steps ranges
diff --git a/onnxruntime/core/providers/cpu/tensor/split.cc b/onnxruntime/core/providers/cpu/tensor/split.cc
index 5032f230b6..d058deb9d3 100644
--- a/onnxruntime/core/providers/cpu/tensor/split.cc
+++ b/onnxruntime/core/providers/cpu/tensor/split.cc
@@ -3,8 +3,9 @@
 
 #include "core/providers/cpu/tensor/split.h"
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
+#include "core/common/narrow.h"
 #include "core/framework/op_kernel_type_control_utils.h"
 #include "core/providers/common.h"
 #include "core/providers/op_kernel_type_control.h"
@@ -58,11 +59,11 @@ Status SplitBase::PrepareForCompute(const TensorShape& input_shape, int num_outp
   axis = HandleNegativeAxis(axis_, num_dimensions);  // handle negative and enforce axis is valid
   const int64_t split_dim_size = input_dims[axis];
 
-  before_dims = gsl::narrow<int>(input_shape.SizeToDimension(axis));
-  after_dims_including_split_axis = gsl::narrow<int>(input_shape.SizeFromDimension(axis));
+  before_dims = narrow<int>(input_shape.SizeToDimension(axis));
+  after_dims_including_split_axis = narrow<int>(input_shape.SizeFromDimension(axis));
   after_dims_excluding_split = (axis + 1 == num_dimensions)
                                    ? 1  // we multiply by this value so must be 1 not 0
-                                   : gsl::narrow<int>(input_shape.SizeFromDimension(axis + 1));
+                                   : narrow<int>(input_shape.SizeFromDimension(axis + 1));
 
   if (split_sizes.empty()) {
     // equal split based on number of outputs
@@ -164,7 +165,7 @@ Status Split::ComputeImpl(OpKernelContext& context, const Tensor& input) const {
 
   for (int i = 0; i < num_outputs; ++i) {
     // update size of dimension for axis we're splitting on
-    auto split_size = gsl::narrow<int>(split_sizes[i]);
+    auto split_size = narrow<int>(split_sizes[i]);
     output_dimensions[axis] = split_size;
 
     Tensor* output = context.Output(i, TensorShape{output_dimensions});
diff --git a/onnxruntime/core/providers/cpu/tensor/tile.cc b/onnxruntime/core/providers/cpu/tensor/tile.cc
index 8292296d67..261866bc84 100644
--- a/onnxruntime/core/providers/cpu/tensor/tile.cc
+++ b/onnxruntime/core/providers/cpu/tensor/tile.cc
@@ -9,7 +9,6 @@
 #pragma warning(disable : 4996)
 #endif
 
-#include "gsl/gsl"
 #include "core/providers/cpu/tensor/tile.h"
 #include "core/providers/cpu/tensor/utils.h"
 
diff --git a/onnxruntime/core/providers/cpu/tensor/transpose.h b/onnxruntime/core/providers/cpu/tensor/transpose.h
index 7672e093e4..c88c318dda 100644
--- a/onnxruntime/core/providers/cpu/tensor/transpose.h
+++ b/onnxruntime/core/providers/cpu/tensor/transpose.h
@@ -10,7 +10,7 @@
 #include "core/framework/op_kernel.h"
 #endif
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include <sstream>
 
 namespace onnxruntime {
diff --git a/onnxruntime/core/providers/cpu/tensor/unique.cc b/onnxruntime/core/providers/cpu/tensor/unique.cc
index bcd4102078..8ace7c7281 100644
--- a/onnxruntime/core/providers/cpu/tensor/unique.cc
+++ b/onnxruntime/core/providers/cpu/tensor/unique.cc
@@ -4,7 +4,7 @@
 #include "core/providers/cpu/tensor/unique.h"
 
 #include <map>
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "core/framework/op_kernel_type_control_utils.h"
 #include "core/providers/common.h"
 #include "core/providers/op_kernel_type_control.h"
diff --git a/onnxruntime/core/providers/cpu/tensor/unsqueeze.cc b/onnxruntime/core/providers/cpu/tensor/unsqueeze.cc
index ac6f37a353..de4125763a 100644
--- a/onnxruntime/core/providers/cpu/tensor/unsqueeze.cc
+++ b/onnxruntime/core/providers/cpu/tensor/unsqueeze.cc
@@ -50,7 +50,7 @@ Status UnsqueezeBase::PrepareCompute(OpKernelContext* ctx, Prepare& p) const {
                     axes_tensor->Shape().NumDimensions() == 1,
                 "An axes tensor must be a scalar or a 1-D tensor.");
     auto data_span = axes_tensor->template DataAsSpan<int64_t>();
-    axes.assign(data_span.cbegin(), data_span.cend());
+    axes.assign(data_span.begin(), data_span.end());
   } else {
     axes.assign(axes_.begin(), axes_.end());
   }
@@ -72,12 +72,12 @@ Status UnsqueezeBase::PrepareCompute(OpKernelContext* ctx, Prepare& p) const {
 
   // Now fill in the zero entries with the existing shape
   {
-    auto begin = input_tensor.Shape().GetDims().cbegin();
+    auto begin = input_tensor.Shape().GetDims().begin();
     for (auto& axisSize : output_dims) {
       if (axisSize == 0)
         axisSize = *begin++;
     }
-    assert(begin == input_tensor.Shape().GetDims().cend());
+    assert(begin == input_tensor.Shape().GetDims().end());
   }
 
   TensorShape output_shape(output_dims);
diff --git a/onnxruntime/core/providers/cpu/tensor/utils.h b/onnxruntime/core/providers/cpu/tensor/utils.h
index fd81045405..2ff1e1bb16 100644
--- a/onnxruntime/core/providers/cpu/tensor/utils.h
+++ b/onnxruntime/core/providers/cpu/tensor/utils.h
@@ -2,7 +2,8 @@
 // Licensed under the MIT License.
 
 #pragma once
-#include "gsl/gsl"
+#include "core/common/gsl.h"
+#include "core/common/narrow.h"
 
 #ifndef SHARED_PROVIDER
 #include "core/framework/utils.h"
@@ -103,7 +104,7 @@ struct ExtentAxisCounters {
     axis_ = indices_.size();
 
     // If a tensor has a shape, but one of the axes is 0 in size, there are no elements, so nothing to iterate
-    if (std::find(extents.cbegin(), extents.cend(), 0) != extents.cend())
+    if (std::find(extents.begin(), extents.end(), 0) != extents.end())
       running_ = false;
   }
 
@@ -207,7 +208,7 @@ struct SliceIteratorBase {
     last_batching_axis_ = dims_size - 1;
 
     // Check if inner dimension is copied as a block in its entirety
-    if (dims_size > 1 && inner_step_ == 1 && inner_extent_ == gsl::narrow<size_t>(dims[dims_size - 1])) {
+    if (dims_size > 1 && inner_step_ == 1 && inner_extent_ == narrow<size_t>(dims[dims_size - 1])) {
       for (size_t dim = dims_size - 2;; dim--) {
         if (dim < steps_size && steps[dim] != 1) {
           break;
diff --git a/onnxruntime/core/providers/cpu/tensor/where_op.cc b/onnxruntime/core/providers/cpu/tensor/where_op.cc
index d87ade8c2f..19bf3cf3c8 100644
--- a/onnxruntime/core/providers/cpu/tensor/where_op.cc
+++ b/onnxruntime/core/providers/cpu/tensor/where_op.cc
@@ -108,7 +108,7 @@ ProcessBroadcastSpanFuncs CreateNonScalarBroadcastFuncs() {
         auto value = per_iter_bh.SpanInput1<T>();
         auto output = per_iter_bh.OutputSpan<T>();
         if (condition == target) {
-          std::copy(value.cbegin(), value.cend(), output.begin());
+          std::copy(value.begin(), value.end(), output.begin());
         } else {
           std::fill(output.begin(), output.end(), T{});
         }
@@ -118,7 +118,7 @@ ProcessBroadcastSpanFuncs CreateNonScalarBroadcastFuncs() {
         auto condition = per_iter_bh.SpanInput0<bool>();
         const T& value = per_iter_bh.ScalarInput1<T>();
         auto output = per_iter_bh.OutputSpan<T>();
-        std::transform(condition.cbegin(), condition.cend(), output.begin(),
+        std::transform(condition.begin(), condition.end(), output.begin(),
                        [target, &value](bool condition_element) {
                          return condition_element == target ? value : T{};
                        });
@@ -128,7 +128,7 @@ ProcessBroadcastSpanFuncs CreateNonScalarBroadcastFuncs() {
         auto condition = per_iter_bh.SpanInput0<bool>();
         auto value = per_iter_bh.SpanInput1<T>();
         auto output = per_iter_bh.OutputSpan<T>();
-        std::transform(condition.cbegin(), condition.cend(), value.cbegin(), output.begin(),
+        std::transform(condition.begin(), condition.end(), value.begin(), output.begin(),
                        [target](bool condition_element, const T& value_element) {
                          return condition_element == target ? value_element : T{};
                        });
@@ -185,7 +185,7 @@ void MergeScalarAndVector(gsl::span<T> output, const T& scalar_value, gsl::span<
   if (!scalar_value.empty()) {
     std::fill(output.begin(), output.end(), scalar_value);
   } else {
-    std::copy(vector_value.cbegin(), vector_value.cend(), output.begin());
+    std::copy(vector_value.begin(), vector_value.end(), output.begin());
   }
 };
 
@@ -206,7 +206,7 @@ EnableIfEigenNotScalar<T, ProcessBroadcastSpanFuncs> MergeBroadcastFuncs() {
         auto X_selection = per_iter_bh.SpanInput0<T>();
         auto Y_selection = per_iter_bh.SpanInput1<T>();
         auto output = per_iter_bh.OutputSpan<T>();
-        std::transform(X_selection.cbegin(), X_selection.cend(), Y_selection.cbegin(), output.begin(),
+        std::transform(X_selection.begin(), X_selection.end(), Y_selection.begin(), output.begin(),
                        [](const T& x, const T& y) { return !x.empty() ? x : y; });
       }};
 }
diff --git a/onnxruntime/core/providers/cuda/controlflow/if.h b/onnxruntime/core/providers/cuda/controlflow/if.h
index f182bbeba2..4075a0b3d2 100644
--- a/onnxruntime/core/providers/cuda/controlflow/if.h
+++ b/onnxruntime/core/providers/cuda/controlflow/if.h
@@ -3,7 +3,6 @@
 
 #pragma once
 #include <functional>
-#include "gsl/gsl"
 
 #include "core/common/common.h"
 #include "core/providers/cpu/controlflow/if.h"
diff --git a/onnxruntime/core/providers/cuda/controlflow/scan.h b/onnxruntime/core/providers/cuda/controlflow/scan.h
index 5b8fbb3ce8..21b9f50d78 100644
--- a/onnxruntime/core/providers/cuda/controlflow/scan.h
+++ b/onnxruntime/core/providers/cuda/controlflow/scan.h
@@ -3,7 +3,6 @@
 
 #pragma once
 #include <functional>
-#include "gsl/gsl"
 
 #include "core/providers/cpu/controlflow/scan.h"
 
diff --git a/onnxruntime/core/providers/cuda/cuda_common.h b/onnxruntime/core/providers/cuda/cuda_common.h
index 7c7103a14d..1176ea2713 100644
--- a/onnxruntime/core/providers/cuda/cuda_common.h
+++ b/onnxruntime/core/providers/cuda/cuda_common.h
@@ -9,7 +9,7 @@
 #include "core/providers/cuda/cuda_pch.h"
 #include "core/providers/cuda/shared_inc/cuda_call.h"
 #include "core/providers/cuda/shared_inc/fast_divmod.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 namespace onnxruntime {
 namespace cuda {
diff --git a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc
index 76d06368cc..146b06d77f 100644
--- a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc
+++ b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc
@@ -9,7 +9,7 @@
 #include <memory>
 #include <chrono>
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 #include "core/providers/cuda/cuda_execution_provider.h"
 #include "core/providers/cuda/cuda_execution_provider_info.h"
diff --git a/onnxruntime/core/providers/cuda/cudnn_common.cc b/onnxruntime/core/providers/cuda/cudnn_common.cc
index 4932f3d2e8..d62a651880 100644
--- a/onnxruntime/core/providers/cuda/cudnn_common.cc
+++ b/onnxruntime/core/providers/cuda/cudnn_common.cc
@@ -3,7 +3,7 @@
 
 #include "cudnn_common.h"
 #include "core/common/inlined_containers.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "shared_inc/cuda_call.h"
 #include "core/providers/cpu/tensor/utils.h"
 
diff --git a/onnxruntime/core/providers/cuda/math/softmax.h b/onnxruntime/core/providers/cuda/math/softmax.h
index 6ed25c306b..f219e963fb 100644
--- a/onnxruntime/core/providers/cuda/math/softmax.h
+++ b/onnxruntime/core/providers/cuda/math/softmax.h
@@ -3,7 +3,7 @@
 
 #pragma once
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "core/providers/cuda/cuda_kernel.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/core/providers/cuda/multi_tensor/common.cuh b/onnxruntime/core/providers/cuda/multi_tensor/common.cuh
index 85e7e218bb..152b340756 100644
--- a/onnxruntime/core/providers/cuda/multi_tensor/common.cuh
+++ b/onnxruntime/core/providers/cuda/multi_tensor/common.cuh
@@ -10,7 +10,7 @@
 #include <vector>
 
 #include "core/common/common.h"
-#include "gsl/gsl-lite.hpp"
+#include "core/common/gsl.h"
 
 namespace onnxruntime {
 namespace cuda {
diff --git a/onnxruntime/core/providers/cuda/nn/conv.cc b/onnxruntime/core/providers/cuda/nn/conv.cc
index fd0d15640f..159d7c1990 100644
--- a/onnxruntime/core/providers/cuda/nn/conv.cc
+++ b/onnxruntime/core/providers/cuda/nn/conv.cc
@@ -1,8 +1,9 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/providers/cuda/cuda_common.h"
 #include "core/providers/cuda/nn/conv.h"
+#include "core/common/span_utils.h"
+#include "core/providers/cuda/cuda_common.h"
 #include "core/providers/cuda/shared_inc/fpgeneric.h"
 #include "core/providers/cuda/tensor/slice.h"
 
@@ -81,7 +82,7 @@ Status SliceOutUnwantedOutputSection(cudaStream_t stream,
   ORT_THROW_IF_ERROR(SliceBase::PrepareForCompute(starts, ends, axes, compute_metadata));
 
   // As a sanity check, ensure that the slice operator's output shape matches with the expected output shape
-  ORT_ENFORCE(gsl::make_span(compute_metadata.output_dims_) == output_dims);
+  ORT_ENFORCE(SpanEq(gsl::make_span(compute_metadata.output_dims_), output_dims));
 
   return SliceCuda::Impl(stream, input_data, input_dims, output_data, compute_metadata, element_size);
 }
@@ -115,7 +116,7 @@ Status Conv<T>::UpdateState(OpKernelContext* context, bool bias_expected) const
     s_.z_data = nullptr;
   }
   bool input_dims_changed = (s_.last_x_dims != x_dims);
-  bool w_dims_changed = (s_.last_w_dims.AsShapeVector() != w_dims);
+  bool w_dims_changed = (s_.last_w_dims != w_dims);
   if (input_dims_changed || w_dims_changed) {
     if (input_dims_changed)
       s_.last_x_dims = gsl::make_span(x_dims);
diff --git a/onnxruntime/core/providers/cuda/nn/pool.cc b/onnxruntime/core/providers/cuda/nn/pool.cc
index 5f7d89824d..3de6b0d1c7 100644
--- a/onnxruntime/core/providers/cuda/nn/pool.cc
+++ b/onnxruntime/core/providers/cuda/nn/pool.cc
@@ -177,7 +177,7 @@ Status Pool<T, PoolType>::ComputeInternal(OpKernelContext* context) const {
   auto x_data = reinterpret_cast<const CudaT*>(X->Data<T>());
   auto y_data = reinterpret_cast<CudaT*>(Y->MutableData<T>());
 
-  TensorShapeVector x_dims_cudnn(x_dims.cbegin(), x_dims.cend());
+  TensorShapeVector x_dims_cudnn(x_dims.begin(), x_dims.end());
   TensorShapeVector y_dims_cudnn(y_dims);
   if (kernel_shape.size() < 2) {
     // cudnn only takes 4D or 5D input, so pad dimensions if needed
diff --git a/onnxruntime/core/providers/cuda/nn/shrink.h b/onnxruntime/core/providers/cuda/nn/shrink.h
index 7784bcf800..088fb74a99 100644
--- a/onnxruntime/core/providers/cuda/nn/shrink.h
+++ b/onnxruntime/core/providers/cuda/nn/shrink.h
@@ -3,7 +3,6 @@
 
 #pragma once
 
-#include "gsl/gsl"
 #include "core/providers/cuda/cuda_kernel.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.h b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.h
index 8e62430980..025b1dbfab 100644
--- a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.h
+++ b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.h
@@ -3,7 +3,7 @@
 
 #pragma once
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 #include <cudnn.h>
 
@@ -105,7 +105,7 @@ class CudnnRnnBase : public CudaKernel {
     ORT_THROW_IF_ERROR(cudnn_dropout_desc_.Set(CudnnHandle(), state_buffer_.get(), state_size));
 
     layout_ = info.GetAttrOrDefault("layout", static_cast<int64_t>(0));
-    ORT_ENFORCE(layout_ == 0, 
+    ORT_ENFORCE(layout_ == 0,
                 "Batchwise recurrent operations (layout == 1) are not supported. If you need support create a github issue with justification.");
   }
 
diff --git a/onnxruntime/core/providers/cuda/rnn/gru.h b/onnxruntime/core/providers/cuda/rnn/gru.h
index 902ad90de1..6f5c5ab6e9 100644
--- a/onnxruntime/core/providers/cuda/rnn/gru.h
+++ b/onnxruntime/core/providers/cuda/rnn/gru.h
@@ -4,7 +4,7 @@
 #pragma once
 
 #include "cudnn_rnn_base.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "core/providers/cuda/cuda_common.h"
 #include <cudnn.h>
 
diff --git a/onnxruntime/core/providers/cuda/rnn/rnn.h b/onnxruntime/core/providers/cuda/rnn/rnn.h
index e902a8e689..e4e50046b3 100644
--- a/onnxruntime/core/providers/cuda/rnn/rnn.h
+++ b/onnxruntime/core/providers/cuda/rnn/rnn.h
@@ -4,7 +4,6 @@
 #pragma once
 
 #include "cudnn_rnn_base.h"
-#include "gsl/gsl"
 #include "core/providers/cuda/cuda_common.h"
 #include <cudnn.h>
 
diff --git a/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h b/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h
index f7d8fdbfb1..e7137648e7 100644
--- a/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h
+++ b/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h
@@ -9,7 +9,7 @@
 #include <memory>
 #include <type_traits>
 #include <vector>
-#include <gsl/gsl>
+#include "core/common/gsl.h"
 
 #include "core/providers/cuda/shared_inc/fast_divmod.h"
 
diff --git a/onnxruntime/core/providers/cuda/tensor/quantize_linear.h b/onnxruntime/core/providers/cuda/tensor/quantize_linear.h
index fe50e9f273..f378778a10 100644
--- a/onnxruntime/core/providers/cuda/tensor/quantize_linear.h
+++ b/onnxruntime/core/providers/cuda/tensor/quantize_linear.h
@@ -5,7 +5,6 @@
 
 #include "core/providers/shared_library/provider_api.h"
 #include "core/providers/cuda/cuda_kernel.h"
-#include "gsl/gsl"
 
 namespace onnxruntime {
 namespace cuda {
diff --git a/onnxruntime/core/providers/cuda/tensor/reshape.h b/onnxruntime/core/providers/cuda/tensor/reshape.h
index 106c01df3a..d6862a86c5 100644
--- a/onnxruntime/core/providers/cuda/tensor/reshape.h
+++ b/onnxruntime/core/providers/cuda/tensor/reshape.h
@@ -5,7 +5,6 @@
 
 #include "core/providers/shared_library/provider_api.h"
 #include "core/providers/cuda/cuda_kernel.h"
-#include "gsl/gsl"
 #include "core/providers/cpu/tensor/reshape_helper.h"
 
 namespace onnxruntime {
@@ -23,7 +22,7 @@ class Reshape final : public CudaKernel {
     if (shapeTensor == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch");
     if (shapeTensor->Shape().NumDimensions() != 1) return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "A shape tensor must be a vector tensor, got ", shapeTensor->Shape().NumDimensions(), " dimensions");
     auto data_span = shapeTensor->template DataAsSpan<int64_t>();
-    TensorShapeVector shape(data_span.cbegin(), data_span.cend());
+    TensorShapeVector shape(data_span.begin(), data_span.end());
     const Tensor* X = context->Input<Tensor>(0);
     if (X == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch");
     const TensorShape& X_shape = X->Shape();
diff --git a/onnxruntime/core/providers/cuda/tensor/transpose.cc b/onnxruntime/core/providers/cuda/tensor/transpose.cc
index d5e100c54a..0870b684f6 100644
--- a/onnxruntime/core/providers/cuda/tensor/transpose.cc
+++ b/onnxruntime/core/providers/cuda/tensor/transpose.cc
@@ -99,7 +99,7 @@ Status Transpose::DoTranspose(const cudaDeviceProp& prop,
   // flatten the adjacent dimensions which are contiguous
   // for example: permutations[0, 2, 3, 1] -> [0, 2, 1], permutations[0, 3, 1, 2] -> [0, 2, 1]
   auto new_rank = rank;
-  InlinedVector<size_t> new_permutations(permutations.cbegin(), permutations.cend());
+  InlinedVector<size_t> new_permutations(permutations.begin(), permutations.end());
   TensorShapeVector new_input_dims = ToShapeVector(input_dims);
   TensorShapeVector new_output_dims = ToShapeVector(output_dims);
 
diff --git a/onnxruntime/core/providers/cuda/tensor/transpose.h b/onnxruntime/core/providers/cuda/tensor/transpose.h
index 1eb79a334f..48a3d0f51a 100644
--- a/onnxruntime/core/providers/cuda/tensor/transpose.h
+++ b/onnxruntime/core/providers/cuda/tensor/transpose.h
@@ -4,7 +4,7 @@
 #pragma once
 
 #include "core/providers/shared_library/provider_api.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "core/providers/cuda/cuda_kernel.h"
 #include "core/providers/cpu/tensor/transpose.h"
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index 64e3f73ec8..416e89e82d 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -253,7 +253,7 @@ namespace Dml
         bool hasInputsToBind = false;
         std::vector<DML_BUFFER_BINDING> inputBufferBindings(inputBindings.size());
 
-        for (gsl::index i = 0; i < inputBindings.size(); i++)
+        for (size_t i = 0; i < inputBindings.size(); i++)
         {
             if (inputBindings[i].Buffer)
             {
@@ -662,11 +662,11 @@ namespace Dml
         uint32_t deviceDataTypeMask = GetSupportedDeviceDataTypeMask(); // Each bit corresponds to each DML_TENSOR_DATA_TYPE.
 
         std::vector<std::unique_ptr<onnxruntime::ComputeCapability>> result;
-        
+
         // Get the list of node indices in toplogical order, so nodes are visited before
         // downstream nodes consuming them.
         const std::vector<onnxruntime::NodeIndex>& toplogicalOrder = graph.GetNodesInTopologicalOrder();
-        for (size_t nodeIndex : toplogicalOrder) 
+        for (size_t nodeIndex : toplogicalOrder)
         {
             const onnxruntime::Node& node = *graph.GetNode(nodeIndex);
             if (IsNodeSupportedByDml(node, kernel_lookup, deviceDataTypeMask))
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorMeanVarianceNormalization.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorMeanVarianceNormalization.cpp
index 92d26ab9f5..cd752ddace 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorMeanVarianceNormalization.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorMeanVarianceNormalization.cpp
@@ -24,7 +24,8 @@ public:
         {
             int32_t crossChannelAxes[] = { 0, 1, 2, 3 };
             int32_t nonChannelAxes[] = {0, 2, 3};
-            gsl::span<int32_t> defaultAxes(acrossChannels ? gsl::make_span(crossChannelAxes) : gsl::make_span(nonChannelAxes));
+            gsl::span<int32_t> defaultAxes(acrossChannels ? gsl::span<int32_t>(crossChannelAxes)
+                                                          : gsl::span<int32_t>(nonChannelAxes));
             onnxAxes.assign(defaultAxes.begin(), defaultAxes.end());
         }
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/precomp.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/precomp.h
index 8d9a8d10dd..83737d2ba4 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/precomp.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/precomp.h
@@ -24,7 +24,7 @@
 #include <wil/wrl.h>
 #include <wil/result.h>
 
-#include <gsl/gsl>
+#include "core/common/gsl.h"
 
 #ifdef _GAMING_XBOX_SCARLETT
 #include <d3d12_xs.h>
diff --git a/onnxruntime/core/providers/dml/GraphTransformers/precomp.h b/onnxruntime/core/providers/dml/GraphTransformers/precomp.h
index 7b146e3c4d..a2cce6baed 100644
--- a/onnxruntime/core/providers/dml/GraphTransformers/precomp.h
+++ b/onnxruntime/core/providers/dml/GraphTransformers/precomp.h
@@ -14,4 +14,4 @@
 #include <wil/wrl.h>
 #include <wil/result.h>
 
-#include <gsl/gsl>
+#include "core/common/gsl.h"
diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp
index 22d39fe685..2d0bc378dd 100644
--- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp
+++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp
@@ -1448,6 +1448,10 @@ namespace OperatorHelper
             return std::equal(a.begin(), a.end(), b.begin(), b.end());
         };
 
+        auto as_span = [](std::initializer_list<uint32_t> il) {
+            return gsl::make_span(il.begin(), il.size());
+        };
+
         std::array<uint32_t, 3> componentRanks;
         if (m_components.size() > componentRanks.size())
         {
@@ -1498,8 +1502,8 @@ namespace OperatorHelper
         struct RecognizedOperatorInfo
         {
             RecognizedOperatorType recognizedOperatorType;
-            std::initializer_list<const uint32_t> componentRanks;
-            std::initializer_list<const uint32_t> labelIndices;
+            std::initializer_list<uint32_t> componentRanks;
+            std::initializer_list<uint32_t> labelIndices;
         };
 
         const RecognizedOperatorInfo recognizedOperators[] = {
@@ -1523,7 +1527,7 @@ namespace OperatorHelper
         // For each recognized operator, compare the labels-per-component and label indices.
         for (auto& recognizedOperator : recognizedOperators)
         {
-            if (equals(m_labelIndices, recognizedOperator.labelIndices)
+            if (equals(m_labelIndices, as_span(recognizedOperator.labelIndices))
             &&  m_components.size() == recognizedOperator.componentRanks.size())
             {
                 for (size_t i = 0; i < m_components.size(); ++i)
@@ -1531,7 +1535,7 @@ namespace OperatorHelper
                     componentRanks[i] = m_components[i].GetDimensionCount();
                 }
 
-                if (equals(gsl::make_span(componentRanks.data(), m_components.size()), recognizedOperator.componentRanks))
+                if (equals(gsl::make_span(componentRanks.data(), m_components.size()), as_span(recognizedOperator.componentRanks)))
                 {
                     return recognizedOperator.recognizedOperatorType;
                 }
diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/precomp.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/precomp.h
index 6c47e60e63..a64d1e01c6 100644
--- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/precomp.h
+++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/precomp.h
@@ -17,4 +17,4 @@
 #include <wil/wrl.h>
 #include <wil/result.h>
 
-#include <gsl/gsl>
+#include "core/common/gsl.h"
diff --git a/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc b/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc
index ea5832bb1d..bb0d619b7e 100644
--- a/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc
+++ b/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc
@@ -14,7 +14,6 @@
 
 #include <iomanip>
 #include <fstream>
-#include "gsl/gsl"
 #define ORT_API_MANUAL_INIT
 #include "core/session/onnxruntime_cxx_api.h"
 
@@ -25,9 +24,9 @@ constexpr const char* DNNL_CPU = "DnnlCpu";
 
 DNNLExecutionProvider::DNNLExecutionProvider(const DNNLExecutionProviderInfo& info)
     : IExecutionProvider{onnxruntime::kDnnlExecutionProvider, true} {
-  
+
   InitProviderOrtApi();
-  
+
   AllocatorCreationInfo default_memory_info(
       {[](int) {
         return onnxruntime::CreateCPUAllocator(OrtMemoryInfo(DNNL, OrtAllocatorType::OrtDeviceAllocator));
@@ -312,7 +311,7 @@ Status DNNLExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fuse
 
     compute_info.compute_func = [](FunctionState state, const OrtApi* /* api */, OrtKernelContext* context) {
       Ort::KernelContext ctx(context);
-      
+
       ort_dnnl::DnnlSubgraphPrimitive* subgraph_primitive = reinterpret_cast<ort_dnnl::DnnlSubgraphPrimitive*>(state);
 
       const size_t subgraph_num_inputs = subgraph_primitive->GetOrderedInputs().size();
diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.h b/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.h
index 11a6491bc4..b8e9079d02 100644
--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.h
+++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.h
@@ -112,7 +112,7 @@ class DnnlSubgraphPrimitive {
   //for memory debug purpose
   std::vector<std::pair<int,int>> items_to_print_;
   void PrintMemory(const dnnl::memory& mem);
-  
+
 };
 
 }  // namespace ort_dnnl
@@ -123,14 +123,13 @@ inline std::ostream& operator<<(std::ostream& os, const dnnl::memory::dims& dims
 }
 
 inline std::ostream& operator<<(std::ostream& os, const gsl::span<const int64_t>& span) {
-  std::copy(span.cbegin(), span.cend(), std::ostream_iterator<int64_t>(os, " "));
+  std::copy(span.begin(), span.end(), std::ostream_iterator<int64_t>(os, " "));
   return os;
 }
 
 inline std::ostream& operator<<(std::ostream& os, const gsl::span<int64_t>& span) {
-  std::copy(span.cbegin(), span.cend(), std::ostream_iterator<int64_t>(os, " "));
+  std::copy(span.begin(), span.end(), std::ostream_iterator<int64_t>(os, " "));
   return os;
 }
 
 }  // namespace onnxruntime
-
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/slice_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/slice_op_builder.cc
index c66f0fb1fc..5b36ee9a78 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/slice_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/slice_op_builder.cc
@@ -140,7 +140,7 @@ Status SliceOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
                               const char* name, const Shape& shape, const gsl::span<const int64_t>& param_raw_data) {
     std::vector<int32_t> param_data;
     param_data.reserve(param_raw_data.size());
-    std::transform(param_raw_data.cbegin(), param_raw_data.cend(),
+    std::transform(param_raw_data.begin(), param_raw_data.end(),
                    std::back_inserter(param_data),
                    [](int64_t i) { return SafeInt<int32_t>(i); });
     std::string param_name = model_builder.GetUniqueName(node_unit.Name() + name);
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder_helpers.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder_helpers.cc
index 2f37c54c87..023eec26ee 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder_helpers.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder_helpers.cc
@@ -8,10 +8,11 @@
 #include <optional>
 #include <utility>
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 #include "core/common/inlined_containers_fwd.h"
 #include "core/common/logging/logging.h"
+#include "core/common/narrow.h"
 #include "core/common/safeint.h"
 #include "core/common/span_utils.h"
 #include "core/framework/tensorprotoutils.h"
@@ -135,7 +136,7 @@ Status AddNnapiSplit(ModelBuilder& model_builder,
   const auto input_rank = shaper[input].size();
   axis = static_cast<int32_t>(HandleNegativeAxis(axis, input_rank));
 
-  const auto count = gsl::narrow<int32_t>(outputs.size());
+  const auto count = narrow<int32_t>(outputs.size());
 
   // Calculate split output shape
   {
@@ -314,7 +315,7 @@ Status BuildBatchMatMul(ModelBuilder& model_builder, const NodeUnit& node_unit)
     std::vector<int32_t> new_shape_i32{};
     new_shape_i32.reserve(new_shape.size());
     std::transform(new_shape.begin(), new_shape.end(), std::back_inserter(new_shape_i32),
-                   [](uint32_t d) { return gsl::narrow<int32_t>(d); });
+                   [](uint32_t d) { return narrow<int32_t>(d); });
     ORT_RETURN_IF_ERROR(AddNnapiReshape(model_builder, input, new_shape_name, new_shape_i32, output));
     return Status::OK();
   };
@@ -966,8 +967,6 @@ Status AddMinMaxOperator(ModelBuilder& model_builder, const NodeUnit& node_unit,
                                                  {output}, {output_operand_type}));
 
   return Status::OK();
-
-  return Status::OK();
 }
 
 // We can skip the Reshape if all the output edges satisfies both the following conditions
@@ -1304,8 +1303,8 @@ Status PerformBroadcasting(const Shape& shape1, const Shape& shape2, Shape& outp
   bool shape1_is_bigger = shape1.size() >= shape2.size();
   auto max_shape = shape1_is_bigger ? shape1 : shape2;
   const auto& min_shape = shape1_is_bigger ? shape2 : shape1;
-  for (int i = gsl::narrow<int>(max_shape.size()) - 1,
-           j = gsl::narrow<int>(min_shape.size()) - 1;
+  for (int i = narrow<int>(max_shape.size()) - 1,
+           j = narrow<int>(min_shape.size()) - 1;
        i >= 0 && j >= 0;
        i--, j--) {
     int dim_max_shape = max_shape[i];
diff --git a/onnxruntime/core/providers/rocm/miopen_common.cc b/onnxruntime/core/providers/rocm/miopen_common.cc
index 7a6afb4b92..5c533312a3 100644
--- a/onnxruntime/core/providers/rocm/miopen_common.cc
+++ b/onnxruntime/core/providers/rocm/miopen_common.cc
@@ -2,7 +2,7 @@
 // Licensed under the MIT License.
 
 #include "miopen_common.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include "core/providers/cpu/tensor/utils.h"
 #include "core/providers/rocm/shared_inc/rocm_call.h"
 
diff --git a/onnxruntime/core/providers/rocm/nn/conv.cc b/onnxruntime/core/providers/rocm/nn/conv.cc
index 62858476f8..301fe1711d 100644
--- a/onnxruntime/core/providers/rocm/nn/conv.cc
+++ b/onnxruntime/core/providers/rocm/nn/conv.cc
@@ -1,8 +1,9 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/providers/rocm/rocm_common.h"
 #include "core/providers/rocm/nn/conv.h"
+#include "core/common/span_utils.h"
+#include "core/providers/rocm/rocm_common.h"
 #include "core/providers/rocm/shared_inc/fpgeneric.h"
 #include "core/providers/rocm/tensor/slice.h"
 
@@ -80,7 +81,7 @@ Status SliceOutUnwantedOutputSection(hipStream_t stream,
   ORT_THROW_IF_ERROR(SliceBase::PrepareForCompute(starts, ends, axes, compute_metadata));
 
   // As a sanity check, ensure that the slice operator's output shape matches with the expected output shape
-  ORT_ENFORCE(gsl::make_span(compute_metadata.output_dims_) == output_dims);
+  ORT_ENFORCE(SpanEq(gsl::make_span(compute_metadata.output_dims_), output_dims));
 
   return SliceRocm::Impl(stream, input_data, input_dims, output_data, compute_metadata, element_size);
 }
@@ -90,7 +91,7 @@ Status Conv<T>::UpdateState(OpKernelContext* context, bool bias_expected) const
   //set X
   const Tensor* X = context->Input<Tensor>(0);
   const TensorShape& x_shape = X->Shape();
-  const auto x_dims = x_shape.GetDims();
+  const auto x_dims = x_shape.AsShapeVector();
   s_.x_data = reinterpret_cast<const HipT*>(X->Data<T>());
   s_.element_size = X->DataType()->Size();
   //set W
@@ -113,11 +114,11 @@ Status Conv<T>::UpdateState(OpKernelContext* context, bool bias_expected) const
   } else {
     s_.z_data = nullptr;
   }
-  bool input_dims_changed = (s_.last_x_dims.GetDims() != x_dims);
-  bool w_dims_changed = (s_.last_w_dims.GetDims() != gsl::make_span(w_dims));
+  bool input_dims_changed = (s_.last_x_dims != x_dims);
+  bool w_dims_changed = (s_.last_w_dims != w_dims);
   if (input_dims_changed || w_dims_changed) {
     if (input_dims_changed)
-      s_.last_x_dims = x_dims;
+      s_.last_x_dims = gsl::make_span(x_dims);
 
     if (w_dims_changed) {
       s_.last_w_dims = gsl::make_span(w_dims);
diff --git a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
index ab75cc47db..3f1b2433b2 100644
--- a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
+++ b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
@@ -68,8 +68,8 @@ Status ConvTranspose<T>::DoConvTranspose(OpKernelContext* context, bool dynamic_
   {
     std::lock_guard<OrtMutex> lock(s_.mutex);
     // TODO: add a global cache if need to handle cases for multiple frames running simultaneously with different batch_size
-    bool input_dims_changed = (s_.last_x_dims.GetDims() != gsl::make_span(x_dims));
-    bool w_dims_changed = (s_.last_w_dims.GetDims() != gsl::make_span(w_dims));
+    bool input_dims_changed = (s_.last_x_dims.AsShapeVector() != x_dims);
+    bool w_dims_changed = (s_.last_w_dims.AsShapeVector() != w_dims);
     if (input_dims_changed || w_dims_changed) {
       if (input_dims_changed)
         s_.last_x_dims = gsl::make_span(x_dims);
diff --git a/onnxruntime/core/providers/rocm/rocm_common.h b/onnxruntime/core/providers/rocm/rocm_common.h
index b179f941f8..6a24883757 100644
--- a/onnxruntime/core/providers/rocm/rocm_common.h
+++ b/onnxruntime/core/providers/rocm/rocm_common.h
@@ -10,7 +10,7 @@
 #include "core/providers/rocm/shared_inc/rocm_call.h"
 #include "core/providers/rocm/shared_inc/fast_divmod.h"
 #include "core/util/math.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 namespace onnxruntime {
 namespace rocm {
diff --git a/onnxruntime/core/providers/rocm/rocm_provider_factory.cc b/onnxruntime/core/providers/rocm/rocm_provider_factory.cc
index 852c8d45e1..a0d2809f3d 100644
--- a/onnxruntime/core/providers/rocm/rocm_provider_factory.cc
+++ b/onnxruntime/core/providers/rocm/rocm_provider_factory.cc
@@ -6,7 +6,7 @@
 
 #include <memory>
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 #include "core/providers/rocm/rocm_execution_provider.h"
 #include "core/providers/rocm/rocm_execution_provider_info.h"
diff --git a/onnxruntime/core/providers/shared_library/provider_api.h b/onnxruntime/core/providers/shared_library/provider_api.h
index 546eecb2f5..b60ef5be4a 100644
--- a/onnxruntime/core/providers/shared_library/provider_api.h
+++ b/onnxruntime/core/providers/shared_library/provider_api.h
@@ -12,7 +12,7 @@
 #include <vector>
 #include <string>
 #include <map>
-#include <gsl/gsl>
+#include "core/common/gsl.h"
 #include <unordered_map>
 #include <unordered_set>
 #include <stddef.h>
diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
index adf2c4ec2d..b74a17709c 100644
--- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
+++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
@@ -886,7 +886,7 @@ inline Status OpKernelInfo::GetAttrs(const std::string& name, TensorShapeVector&
   Status status = this->GetAttrsAsSpan<int64_t>(name, span);
   if (status.IsOK()) {
     out.reserve(span.size());
-    out.assign(span.cbegin(), span.cend());
+    out.assign(span.begin(), span.end());
   }
   return status;
 }
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
index d9184367f4..7b2c327c4d 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -13,7 +13,7 @@
 #include "core/providers/cuda/math/unary_elementwise_ops_impl.h"
 #include "core/providers/cuda/gpu_data_transfer.h"
 #include "cuda_runtime_api.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 #include <unordered_map>
 #include <utility>
 #include <limits>
diff --git a/onnxruntime/core/providers/xnnpack/detail/utils.cc b/onnxruntime/core/providers/xnnpack/detail/utils.cc
index 3d80921694..a8820acb39 100644
--- a/onnxruntime/core/providers/xnnpack/detail/utils.cc
+++ b/onnxruntime/core/providers/xnnpack/detail/utils.cc
@@ -373,7 +373,7 @@ gsl::span<const T> ReadConstantValues(const OpKernelInfo& info, int idx) {
     } else {
       // It's legal for zero-point to be null, we just give its default value 0
       static const T default_zp[] = {0};
-      return gsl::make_span(default_zp, static_cast<typename gsl::span<T>::index_type>(1));
+      return gsl::make_span(default_zp, static_cast<typename gsl::span<T>::size_type>(1));
     }
   }
   return (tensor->DataAsSpan<T>());
diff --git a/onnxruntime/core/providers/xnnpack/nn/conv.cc b/onnxruntime/core/providers/xnnpack/nn/conv.cc
index 01067d7ab0..a4ba644361 100644
--- a/onnxruntime/core/providers/xnnpack/nn/conv.cc
+++ b/onnxruntime/core/providers/xnnpack/nn/conv.cc
@@ -10,7 +10,7 @@
 #include "core/providers/utils.h"
 #include "core/providers/xnnpack/detail/utils.h"
 #include "core/framework/tensorprotoutils.h"
-#include "gsl/gsl-lite.hpp"
+#include "core/common/gsl.h"
 
 namespace onnxruntime {
 namespace xnnpack {
diff --git a/onnxruntime/core/session/abi_session_options.cc b/onnxruntime/core/session/abi_session_options.cc
index 92fcc861d0..a914b1bbac 100644
--- a/onnxruntime/core/session/abi_session_options.cc
+++ b/onnxruntime/core/session/abi_session_options.cc
@@ -2,7 +2,6 @@
 // Licensed under the MIT License.
 
 #include "core/graph/onnx_protobuf.h"
-#include "core/common/gsl_suppress.h"
 #include "core/common/inlined_containers.h"
 #include "core/session/onnxruntime_c_api.h"
 #include "core/session/ort_apis.h"
@@ -220,5 +219,3 @@ ORT_API_STATUS_IMPL(OrtApis::AddExternalInitializers, _In_ OrtSessionOptions* op
   return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "External initializers are not supported in this build");
 #endif
 }
-
-
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index 097a812b3a..3d8d1fe2c1 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -1652,7 +1652,7 @@ common::Status InferenceSession::ValidateInputs(gsl::span<const std::string> fee
     }
 
     auto expected_type = iter->second.ml_data_type;
-    auto& input_ml_value = feeds.at(i);
+    auto& input_ml_value = feeds[i];
     if (input_ml_value.IsTensor()) {
       if (!expected_type->IsTensorType()
 #if !defined(DISABLE_OPTIONAL_TYPE)
diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc
index 31b7415850..37e7834c62 100644
--- a/onnxruntime/core/session/onnxruntime_c_api.cc
+++ b/onnxruntime/core/session/onnxruntime_c_api.cc
@@ -17,6 +17,7 @@
 
 #include "core/common/common.h"
 #include "core/common/logging/logging.h"
+#include "core/common/narrow.h"
 #include "core/common/status.h"
 #include "core/common/safeint.h"
 #include "core/graph/constants.h"
@@ -83,6 +84,7 @@ using onnxruntime::OutputDefList;
 using onnxruntime::Tensor;
 using onnxruntime::ToOrtStatus;
 using onnxruntime::common::Status;
+using onnxruntime::narrow;
 
 using namespace onnxruntime;
 
@@ -193,11 +195,11 @@ ORT_STATUS_PTR CreateTensorImplForSeq(MLDataType elem_type, const int64_t* shape
 ORT_STATUS_PTR CreateTensorImpl(MLDataType ml_type, const int64_t* shape, size_t shape_len, const OrtMemoryInfo* info,
                                 void* p_data, size_t p_data_len, OrtValue& ort_value) {
   TensorShape tensor_shape(shape, shape_len);
-  if (std::any_of(tensor_shape.GetDims().cbegin(), tensor_shape.GetDims().cend(), [](int64_t v) { return v < 0; })) {
+  if (std::any_of(tensor_shape.GetDims().begin(), tensor_shape.GetDims().end(), [](int64_t v) { return v < 0; })) {
     return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "tried creating tensor with negative value in shape");
   }
 
-  auto elem_count = gsl::narrow<size_t>(tensor_shape.Size());
+  auto elem_count = narrow<size_t>(tensor_shape.Size());
   size_t size_to_allocate;
   if (!IAllocator::CalcMemSizeForArray(ml_type->Size(), elem_count, &size_to_allocate)) {
     return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "size overflow");
@@ -243,7 +245,7 @@ ORT_API_STATUS_IMPL(OrtApis::CreateSparseTensorAsOrtValue, _Inout_ OrtAllocator*
   auto element_type = sparse_tensor_type->GetElementType();
   assert(element_type->AsPrimitiveDataType() != nullptr);
   TensorShape shape(dense_shape, dense_shape_len);
-  if (std::any_of(shape.GetDims().cbegin(), shape.GetDims().cend(),
+  if (std::any_of(shape.GetDims().begin(), shape.GetDims().end(),
                   [](int64_t v) { return v < 0; })) {
     return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "tried creating tensor with negative value in shape");
   }
@@ -288,7 +290,7 @@ SparseTensor& ValidateFillInputArgs(OrtValue* v, const TensorShape& values_shape
       ORT_THROW("Strings can only reside in CPU memory");
     }
   }
-  if (std::any_of(values_shape.GetDims().cbegin(), values_shape.GetDims().cend(),
+  if (std::any_of(values_shape.GetDims().begin(), values_shape.GetDims().end(),
                   [](int64_t v) { return v < 0; })) {
     ORT_THROW("tried Filling sparse tensor with negative value in values shape");
   }
@@ -313,7 +315,7 @@ ORT_API_STATUS_IMPL(OrtApis::FillSparseTensorCoo, _Inout_ OrtValue* ort_value, _
   TensorShape values_t_shape(values_shape, values_shape_len);
   auto& sparse_tensor = ValidateFillInputArgs(ort_value, values_t_shape, data_mem_info);
 
-  auto values_size = gsl::narrow<size_t>(values_t_shape.Size());
+  auto values_size = narrow<size_t>(values_t_shape.Size());
   auto indices_span = gsl::make_span(indices_data, indices_num);
 
   if (sparse_tensor.IsDataTypeString()) {
@@ -347,7 +349,7 @@ ORT_API_STATUS_IMPL(OrtApis::FillSparseTensorCsr, _Inout_ OrtValue* ort_value, _
 #if !defined(DISABLE_SPARSE_TENSORS)
   TensorShape values_t_shape(values_shape, values_shape_len);
   auto& sparse_tensor = ValidateFillInputArgs(ort_value, values_t_shape, data_mem_info);
-  auto values_size = gsl::narrow<size_t>(values_t_shape.Size());
+  auto values_size = narrow<size_t>(values_t_shape.Size());
 
   auto inner_indices_span = gsl::make_span(inner_indices_data, inner_indices_num);
   auto outer_indices_span = gsl::make_span(outer_indices_data, outer_indices_num);
@@ -385,7 +387,7 @@ ORT_API_STATUS_IMPL(OrtApis::FillSparseTensorBlockSparse, _Inout_ OrtValue* ort_
   auto& sparse_tensor = ValidateFillInputArgs(ort_value, values_t_shape, data_mem_info);
 
   TensorShape indices_t_shape(indices_shape_data, indices_shape_len);
-  if (std::any_of(indices_t_shape.GetDims().cbegin(), indices_t_shape.GetDims().cend(),
+  if (std::any_of(indices_t_shape.GetDims().begin(), indices_t_shape.GetDims().end(),
                   [](int64_t v) { return v < 0; })) {
     ORT_THROW("tried Filling sparse tensor with negative value in block sparse indices shape");
   }
@@ -430,7 +432,7 @@ ORT_API_STATUS_IMPL(OrtApis::CreateSparseTensorWithValuesAsOrtValue, _In_ const
   }
   TensorShape tensor_dense_shape(dense_shape, dense_shape_len);
   TensorShape tensor_values_shape(values_shape, values_shape_len);
-  if (std::any_of(tensor_values_shape.GetDims().cbegin(), tensor_values_shape.GetDims().cend(),
+  if (std::any_of(tensor_values_shape.GetDims().begin(), tensor_values_shape.GetDims().end(),
                   [](int64_t v) { return v < 0; })) {
     return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "tried creating tensor with negative value in shape");
   }
@@ -1563,7 +1565,7 @@ static ORT_STATUS_PTR OrtGetValueImplSeqOfMap(const OrtValue* p_ml_value, int in
 
 ORT_STATUS_PTR PopulateTensorWithData(Tensor& tensor, bool is_string, _In_ const void* data_elem, size_t num_elems,
                                       size_t elem_size) {
-  auto len = gsl::narrow<size_t>(tensor.Shape().Size());
+  auto len = narrow<size_t>(tensor.Shape().Size());
   if (num_elems < len) {
     return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "input array is too short");
   }
@@ -1573,7 +1575,7 @@ ORT_STATUS_PTR PopulateTensorWithData(Tensor& tensor, bool is_string, _In_ const
     const std::string* strings = reinterpret_cast<const std::string*>(data_elem);
     auto str_span = gsl::make_span(strings, num_elems);
     auto* dst = tensor.MutableData<std::string>();
-    std::copy(str_span.cbegin(), str_span.cend(), dst);
+    std::copy(str_span.begin(), str_span.end(), dst);
   }
   return nullptr;
 }
@@ -1600,7 +1602,7 @@ static ORT_STATUS_PTR OrtGetValueImplSeqOfTensors(_In_ const OrtValue* p_ml_valu
   auto result = std::make_unique<OrtValue>();
   ORT_API_RETURN_IF_ERROR(c_api_internal::CreateTensorAndPopulate(one_tensor.DataType(), tensor_shape.GetDims().data(),
                                                                   tensor_shape.NumDimensions(), one_tensor.DataRaw(),
-                                                                  gsl::narrow<size_t>(one_tensor.Shape().Size()),
+                                                                  narrow<size_t>(one_tensor.Shape().Size()),
                                                                   allocator, *result));
   *out = result.release();
   return nullptr;
@@ -1759,7 +1761,7 @@ static ORT_STATUS_PTR OrtCreateValueImplSeqHelperTensor(const Tensor& tensor, Te
   ORT_API_RETURN_IF_ERROR(CreateTensorImplForSeq(data_type,
                                                  tensor.Shape().GetDims().data(), tensor.Shape().NumDimensions(),
                                                  out));
-  size_t num_elements = gsl::narrow<size_t>(tensor.Shape().Size());
+  size_t num_elements = narrow<size_t>(tensor.Shape().Size());
   ORT_API_RETURN_IF_ERROR(c_api_internal::PopulateTensorWithData(out, tensor.IsDataTypeString(),
                                                                  tensor.DataRaw(), num_elements, data_type->Size()));
   return nullptr;
@@ -2003,7 +2005,7 @@ ORT_API_STATUS_IMPL(OrtApis::GetAvailableProviders, _Outptr_ char*** out_ptr,
   // and use a single string object to hold all the names.
   constexpr size_t MAX_LEN = 30;
   const auto& available_providers = GetAvailableExecutionProviderNames();
-  const int available_count = gsl::narrow<int>(available_providers.size());
+  const int available_count = narrow<int>(available_providers.size());
   char** const out = new char*[available_count];
   if (out) {
     for (int i = 0; i < available_count; i++) {
diff --git a/onnxruntime/core/util/math_cpu.cc b/onnxruntime/core/util/math_cpu.cc
index 164e88573c..a505b8e018 100644
--- a/onnxruntime/core/util/math_cpu.cc
+++ b/onnxruntime/core/util/math_cpu.cc
@@ -19,7 +19,7 @@
 #include "core/util/math.h"
 
 #include <algorithm>
-#include <gsl/gsl>
+#include "core/common/narrow.h"
 #include "core/mlas/inc/mlas.h"
 #if defined(__GNUC__)
 #pragma GCC diagnostic push
@@ -451,7 +451,7 @@ void Im2col<T, StorageOrder::NHWC>::operator()(
             if (is_a_ge_zero_and_a_lt_b(iw, input_w)) {
               // Increase the copy count size to reduce the number of copy calls.
               int64_t batch_w = std::min(kw, input_w - iw);
-              std::memcpy(data_col, data_im + (ih * input_w + iw) * group_channels, gsl::narrow<size_t>(sizeof(T) * batch_w * group_channels));
+              std::memcpy(data_col, data_im + (ih * input_w + iw) * group_channels, narrow<size_t>(sizeof(T) * batch_w * group_channels));
               data_col += batch_w * group_channels;
               iw += batch_w;
               kw -= batch_w;
@@ -466,7 +466,7 @@ void Im2col<T, StorageOrder::NHWC>::operator()(
             if (is_a_ge_zero_and_a_lt_b(iw, input_w)) {
               // N.B. Using std::memcpy helped here over std::copy_n when doing a
               // transform for an image with a small number of group channels.
-              std::memcpy(data_col, data_im + (ih * input_w + iw) * input_channels, gsl::narrow<size_t>(sizeof(T) * group_channels));
+              std::memcpy(data_col, data_im + (ih * input_w + iw) * input_channels, narrow<size_t>(sizeof(T) * group_channels));
               data_col += group_channels;
             } else {
               data_col = std::fill_n(data_col, group_channels, padding_value);
@@ -668,7 +668,7 @@ void Col2im<float, CPUMathUtil, StorageOrder::NCHW>(const float* data_col, int64
   const int64_t output_hw = output_h * output_w;
   const int64_t hw = height * width;
   const int64_t hwc = hw * channels;
-  Set<float, CPUMathUtil>(gsl::narrow<ptrdiff_t>(hwc), 0, data_im, context);
+  Set<float, CPUMathUtil>(narrow<ptrdiff_t>(hwc), 0, data_im, context);
 
   // Fast path for zero padding and no dilation
   // From Torch, modified THNN_(unfolded_acc)
@@ -756,7 +756,7 @@ void Col2im<float, CPUMathUtil, StorageOrder::NHWC>(const float* data_col, int64
   const int64_t dkernel_w = dilation_w * (kernel_w - 1) + 1;
 
   const int64_t hwc = height * width * channels;
-  Set<float, CPUMathUtil>(gsl::narrow<ptrdiff_t>(hwc), 0, data_im, context);
+  Set<float, CPUMathUtil>(narrow<ptrdiff_t>(hwc), 0, data_im, context);
   int64_t height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
   int64_t width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;
   int64_t h_pad = -pad_t;
@@ -785,7 +785,7 @@ void Col2imNd<float, CPUMathUtil, StorageOrder::NCHW>(const float* data_col, con
                                                       const int64_t* kernel_shape, const int64_t* stride,
                                                       const int64_t* dilation, const int64_t* pad, ptrdiff_t N,
                                                       float* data_img, CPUMathUtil* context) {
-  Set<float, CPUMathUtil>(gsl::narrow<ptrdiff_t>(img_size), 0, data_img, context);
+  Set<float, CPUMathUtil>(narrow<ptrdiff_t>(img_size), 0, data_img, context);
   Im2col<float, StorageOrder::NCHW>()(
       data_col,
       img_shape,
diff --git a/onnxruntime/gsl/gsl b/onnxruntime/gsl/gsl
deleted file mode 100644
index be3f2ccdb2..0000000000
--- a/onnxruntime/gsl/gsl
+++ /dev/null
@@ -1,27 +0,0 @@
-//
-// gsl-lite is based on GSL: Guidelines Support Library.
-// For more information see https://github.com/martinmoene/gsl-lite
-//
-// Copyright (c) 2015 Martin Moene
-// Copyright (c) 2015 Microsoft Corporation. All rights reserved. 
-// 
-// This code is licensed under the MIT License (MIT). 
-// 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 
-// THE SOFTWARE. 
-
-// mimic MS include hierarchy
-
-#pragma once
-
-#ifndef GSL_GSL_H_INCLUDED
-#define GSL_GSL_H_INCLUDED
-
-#include "gsl-lite.hpp"
-
-#endif // GSL_GSL_H_INCLUDED
diff --git a/onnxruntime/gsl/gsl-lite-vc6.hpp b/onnxruntime/gsl/gsl-lite-vc6.hpp
deleted file mode 100644
index 8024c0ca51..0000000000
--- a/onnxruntime/gsl/gsl-lite-vc6.hpp
+++ /dev/null
@@ -1,697 +0,0 @@
-//
-// gsl-lite-vc6 is based on GSL: Guidelines Support Library,
-// For more information see https://github.com/martinmoene/gsl-lite
-//
-// Copyright (c) 2015 Martin Moene
-// Copyright (c) 2015 Microsoft Corporation. All rights reserved.
-//
-// This code is licensed under the MIT License (MIT).
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-// THE SOFTWARE.
-
-#pragma once
-
-#ifndef GSL_GSL_LITE_H_INCLUDED
-#define GSL_GSL_LITE_H_INCLUDED
-
-#include <exception>
-#include <iterator>
-#include <limits>
-#include <memory>
-#include <stdexcept>
-#include <string>
-#include <utility>
-#include <vector>
-
-#define  gsl_lite_VERSION "0.0.0"
-
-// Configuration:
-
-#ifndef  gsl_FEATURE_IMPLICIT_MACRO
-# define gsl_FEATURE_IMPLICIT_MACRO  1
-#endif
-
-#ifndef  gsl_FEATURE_OWNER_MACRO
-# define gsl_FEATURE_OWNER_MACRO  1
-#endif
-
-#ifndef  gsl_FEATURE_SHARED_PTR
-# define gsl_FEATURE_SHARED_PTR  0
-#endif
-
-#ifndef  gsl_FEATURE_UNIQUE_PTR
-# define gsl_FEATURE_UNIQUE_PTR  0
-#endif
-
-#ifndef  gsl_CONFIG_THROWS_FOR_TESTING
-# define gsl_CONFIG_THROWS_FOR_TESTING  0
-#endif
-
-#ifndef  gsl_CONFIG_CONFIRMS_COMPILATION_ERRORS
-# define gsl_CONFIG_CONFIRMS_COMPILATION_ERRORS  0
-#endif
-
-#ifndef  gsl_CONFIG_SHARED_PTR_INCLUDE
-# define gsl_CONFIG_SHARED_PTR_INCLUDE  <boost/shared_ptr.hpp>
-#endif
-
-#ifndef  gsl_CONFIG_UNIQUE_PTR_INCLUDE
-# define gsl_CONFIG_UNIQUE_PTR_INCLUDE  <boost/unique_ptr.hpp>
-#endif
-
-#ifndef  gsl_CONFIG_SHARED_PTR_DECL
-# define gsl_CONFIG_SHARED_PTR_DECL  boost::shared_ptr
-#endif
-
-#ifndef  gsl_CONFIG_UNIQUE_PTR_DECL
-# define gsl_CONFIG_UNIQUE_PTR_DECL  boost::unique_ptr
-#endif
-
-// Compiler detection:
-
-#if defined(_MSC_VER ) && !defined(__clang__)
-# define gsl_COMPILER_MSVC_VER      (_MSC_VER )
-# define gsl_COMPILER_MSVC_VERSION  (_MSC_VER / 10 - 10 * ( 5 + (_MSC_VER < 1900 ) ) )
-#else
-# define gsl_COMPILER_MSVC_VER       0
-# define gsl_COMPILER_MSVC_VERSION   0
-# define gsl_COMPILER_NON_MSVC       1
-#endif
-
-#if gsl_COMPILER_MSVC_VERSION != 60
-# error GSL Lite: this header is for Visual C++ 6
-#endif
-
-// half-open range [lo..hi):
-#define gsl_BETWEEN( v, lo, hi ) ( (lo) <= (v) && (v) < (hi) )
-
-// Presence of C++ language features:
-
-// C++ feature usage:
-
-#if gsl_FEATURE_IMPLICIT_MACRO
-# define implicit
-#endif
-
-#define gsl_DIMENSION_OF( a ) ( sizeof(a) / sizeof(0[a]) )
-
-#if gsl_FEATURE_SHARED_PTR
-# include gsl_CONFIG_SHARED_PTR_INCLUDE
-#endif
-
-#if gsl_FEATURE_UNIQUE_PTR
-# include gsl_CONFIG_UNIQUE_PTR_INCLUDE
-#endif
-
-namespace gsl {
-
-//
-// GSL.owner: ownership pointers
-//
-// ToDo:
-#if gsl_FEATURE_SHARED_PTR
-  using gsl_CONFIG_SHARED_PTR_DECL;
-#endif
-#if gsl_FEATURE_UNIQUE_PTR
-  using gsl_CONFIG_UNIQUE_PTR_DECL;
-#endif
-
-template< class T > struct owner { typedef T type; };
-
-#define gsl_HAVE_OWNER_TEMPLATE  0
-
-#if gsl_FEATURE_OWNER_MACRO
-# define Owner(t)  ::gsl::owner<t>::type
-#endif
-
-//
-// GSL.assert: assertions
-//
-#define Expects(x)  ::gsl::fail_fast_assert((x))
-#define Ensures(x)  ::gsl::fail_fast_assert((x))
-
-#if gsl_CONFIG_THROWS_FOR_TESTING
-
-struct fail_fast : public std::runtime_error
-{
-    fail_fast()
-    : std::runtime_error( "GSL assertion" ) {}
-
-    explicit fail_fast( char const * const message )
-    : std::runtime_error( message ) {}
-};
-
-inline void fail_fast_assert( bool cond )
-{
-    if ( !cond )
-        throw fail_fast();
-}
-
-inline void fail_fast_assert( bool cond, char const * const message )
-{
-    if ( !cond )
-        throw fail_fast( message );
-}
-
-#else // gsl_CONFIG_THROWS_FOR_TESTING
-
-inline void fail_fast_assert( bool cond )
-{
-    if ( !cond )
-        terminate();
-}
-
-inline void fail_fast_assert( bool cond, char const * const )
-{
-    if ( !cond )
-        terminate();
-}
-
-#endif // gsl_CONFIG_THROWS_FOR_TESTING
-
-//
-// GSL.util: utilities
-//
-
-class final_action
-{
-public:
-    typedef void (*Action)();
-
-    final_action( Action action )
-    : action_( action ) {}
-
-    ~final_action()
-    {
-        action_();
-    }
-
-private:
-    Action action_;
-};
-
-template< class Fn >
-final_action finally( Fn const & f )
-{
-    return final_action(( f ));
-}
-
-template< class T, class U >
-T narrow_cast( U u )
-{
-    return static_cast<T>( u );
-}
-
-struct narrowing_error : public std::exception {};
-
-template< class T, class U >
-T narrow( U u )
-{
-    T t = narrow_cast<T>( u );
-
-    if ( static_cast<U>( t ) != u )
-    {
-        throw narrowing_error();
-    }
-    return t;
-}
-
-//
-// GSL.views: views
-//
-
-//
-// at() - Bounds-checked way of accessing static arrays, std::array, std::vector.
-//
-
-namespace detail {
-
-struct precedence_0 {};
-struct precedence_1 : precedence_0 {};
-struct order_precedence : precedence_1 {};
-
-template< class Array, class T >
-T & at( Array & arr, size_t index, T*, precedence_0 const & )
-{
-    Expects( index < gsl_DIMENSION_OF( arr ) );
-    return arr[index];
-}
-
-} // namespace detail
-
-// Create an at( container ) function:
-
-# define gsl_MK_AT( Cont ) \
-    namespace gsl { namespace detail { \
-    template< class T > \
-    inline T & at( Cont<T> & cont, size_t index, T*, precedence_1 const & ) \
-    { \
-        Expects( index < cont.size() ); \
-        return cont[index]; \
-    } }}
-
-template< class Cont >
-int & at( Cont & cont, size_t index )
-{
-    return detail::at( cont, index, &cont[0], detail::order_precedence() );
-}
-
-//
-// not_null<> - Wrap any indirection and enforce non-null.
-//
-template<class T>
-class not_null
-{
-public:
-    not_null(             T t         ) : ptr_ ( t ){ Expects( ptr_ != NULL ); }
-    not_null & operator=( T const & t ) { ptr_ = t ;  Expects( ptr_ != NULL ); return *this; }
-
-    not_null(             not_null const & other ) : ptr_ ( other.ptr_  ) {}
-    not_null & operator=( not_null const & other ) { ptr_ = other.ptr_; }
-
-    // VC6 accepts this anyway:
-    // template< typename U > not_null( not_null<U> const & other );
-    // template< typename U > not_null & operator=( not_null<U> const & other ) ;
-
-private:
-    // Prevent compilation when initialized with a literal 0:
-    not_null(             int );
-    not_null & operator=( int );
-
-public:
-    T get() const
-    {
-        return ptr_;
-    }
-
-         operator T() const { return get(); }
-    T    operator->() const { return get(); }
-
-    bool operator==(T const & rhs) const { return    ptr_ == rhs; }
-    bool operator!=(T const & rhs) const { return !(*this == rhs); }
-
-private:
-    T ptr_;
-
-    not_null & operator++();
-    not_null & operator--();
-    not_null   operator++( int );
-    not_null   operator--( int );
-    not_null & operator+ ( size_t );
-    not_null & operator+=( size_t );
-    not_null & operator- ( size_t );
-    not_null & operator-=( size_t );
-};
-
-//
-// Byte-specific type.
-//
-typedef unsigned char byte;
-
-//
-// span<> - A 1D view of contiguous T's, replace (*,len).
-//
-template< class T >
-class span
-{
-public:
-    typedef size_t size_type;
-
-    typedef T value_type;
-    typedef T & reference;
-    typedef T * pointer;
-    typedef T const * const_pointer;
-
-    typedef pointer       iterator;
-    typedef const_pointer const_iterator;
-
-    typedef std::reverse_iterator< iterator, T >             reverse_iterator;
-    typedef std::reverse_iterator< const_iterator, const T > const_reverse_iterator;
-
-    // Todo:
-    // typedef typename std::iterator_traits< iterator >::difference_type difference_type;
-
-    span()
-        : begin_( NULL )
-        , end_  ( NULL )
-    {
-        Expects( size() == 0 );
-    }
-
-    span( pointer begin, pointer end )
-        : begin_( begin )
-        , end_  ( end )
-    {
-        Expects( begin <= end );
-    }
-
-    span( pointer data, size_type size )
-        : begin_( data )
-        , end_  ( data + size )
-    {
-        Expects( size == 0 || ( size > 0 && data != NULL ) );
-    }
-
-private:
-    struct precedence_0 {};
-    struct precedence_1 : precedence_0 {};
-    struct precedence_2 : precedence_1 {};
-    struct order_precedence : precedence_1 {};
-
-    template< class Array, class U >
-    span create( Array & arr, U*, precedence_0 const & ) const
-    {
-        return span( arr, gsl_DIMENSION_OF( arr ) );
-    }
-
-    span create( std::vector<T> & cont, T*, precedence_1 const & ) const
-    {
-        return span( &cont[0], cont.size() );
-    }
-
-public:
-    template< class Cont >
-    span( Cont & cont )
-    {
-        *this = create( cont, &cont[0], order_precedence() );
-    }
-
-#if 0
-    // =default constructor
-    span( span const & other )
-        : begin_( other.begin() )
-        , end_  ( other.end() )
-    {}
-#endif
-
-    span & operator=( span const & other )
-    {
-        // VC6 balks at copy-swap implementation (here),
-        // so we do it the simple way:
-        begin_ = other.begin_;
-        end_   = other.end_;
-        return *this;
-    }
-
-#if 0
-    // Converting from other span ?
-    template< typename U > operator=();
-#endif
-
-    iterator begin() const
-    {
-        return iterator( begin_ );
-    }
-
-    iterator end() const
-    {
-        return iterator( end_ );
-    }
-
-    const_iterator cbegin() const
-    {
-        return const_iterator( begin() );
-    }
-
-    const_iterator cend() const
-    {
-        return const_iterator( end() );
-    }
-
-    reverse_iterator rbegin() const
-    {
-        return reverse_iterator( end() );
-    }
-
-    reverse_iterator rend() const
-    {
-        return reverse_iterator( begin() );
-    }
-
-    const_reverse_iterator crbegin() const
-    {
-        return const_reverse_iterator( cend() );
-    }
-
-    const_reverse_iterator crend() const
-    {
-        return const_reverse_iterator( cbegin() );
-    }
-
-    operator bool () const
-    {
-        return begin_ != NULL;
-    }
-
-    reference operator[]( size_type index )
-    {
-        return at( index );
-    }
-
-    bool operator==( span const & other ) const
-    {
-        return  size() == other.size()
-            && (begin_ == other.begin_ || std::equal( this->begin(), this->end(), other.begin() ) );
-    }
-
-    bool operator!=( span const & other ) const
-    {
-        return !( *this == other );
-    }
-
-    bool operator< ( span const & other ) const
-    {
-        return std::lexicographical_compare( this->begin(), this->end(), other.begin(), other.end() );
-    }
-
-    bool operator<=( span const & other ) const
-    {
-        return !( other < *this );
-    }
-
-    bool operator> ( span const & other ) const
-    {
-        return ( other < *this );
-    }
-
-    bool operator>=( span const & other ) const
-    {
-        return !( *this < other );
-    }
-
-    reference at( size_type index )
-    {
-        Expects( index >= 0 && index < size());
-        return begin_[ index ];
-    }
-
-    pointer data() const
-    {
-        return begin_;
-    }
-
-    bool empty() const
-    {
-        return size() == 0;
-    }
-
-    size_type size() const
-    {
-        return std::distance( begin_, end_ );
-    }
-
-    size_type length() const
-    {
-        return size();
-    }
-
-    size_type used_length() const
-    {
-        return length();
-    }
-
-    size_type bytes() const
-    {
-        return sizeof( value_type ) * size();
-    }
-
-    size_type used_bytes() const
-    {
-        return bytes();
-    }
-
-    void swap( span & other )
-    {
-        using std::swap;
-        swap( begin_, other.begin_ );
-        swap( end_  , other.end_   );
-    }
-
-    span< const byte > as_bytes() const
-    {
-        return span< const byte >( reinterpret_cast<const byte *>( data() ), bytes() );
-    }
-
-    span< byte > as_writeable_bytes() const
-    {
-        return span< byte >( reinterpret_cast<byte *>( data() ), bytes() );
-    }
-
-    template< class U >
-    struct mk
-    {
-        static span<U> view( U * data, size_type size )
-        {
-            return span<U>( data, size );
-        }
-    };
-
-    template< typename U >
-    span< U > as_span( U u = U() ) const
-    {
-        Expects( ( this->bytes() % sizeof(U) ) == 0 );
-        return mk<U>::view( reinterpret_cast<U *>( this->data() ), this->bytes() / sizeof( U ) );
-    }
-
-private:
-    pointer begin_;
-    pointer end_;
-};
-
-// span creator functions (see ctors)
-
-template< typename T>
-span< const byte > as_bytes( span<T> spn )
-{
-    return span< const byte >( reinterpret_cast<const byte *>( spn.data() ), spn.bytes() );
-}
-
-template< typename T>
-span< byte > as_writeable_bytes( span<T> spn )
-{
-    return span< byte >( reinterpret_cast<byte *>( spn.data() ), spn.bytes() );
-}
-
-template< typename T >
-span<T> as_span( T * begin, T * end )
-{
-    return span<T>( begin, end );
-}
-
-template< typename T >
-span<T> as_span( T * begin, size_t size )
-{
-    return span<T>( begin, size );
-}
-
-namespace detail {
-
-template< class T >
-struct mk
-{
-    static span<T> view( std::vector<T> & cont )
-    {
-        return span<T>( cont );
-    }
-};
-}
-
-template< class T >
-span<T> as_span( std::vector<T> & cont )
-{
-    return detail::mk<T>::view( cont );
-}
-
-//
-// String types:
-//
-
-typedef char * zstring;
-typedef wchar_t * zwstring;
-typedef const char * czstring;
-typedef const wchar_t * cwzstring;
-
-typedef span< char > string_span;
-typedef span< wchar_t > wstring_span;
-typedef span< const char > cstring_span;
-typedef span< const wchar_t > cwstring_span;
-
-// to_string() allow (explicit) conversions from string_span to string
-
-inline std::string to_string( string_span const & view )
-{
-    return std::string( view.data(), view.length() );
-}
-
-inline std::string to_string( cstring_span const & view )
-{
-    return std::string( view.data(), view.length() );
-}
-
-inline std::wstring to_string( wstring_span const & view )
-{
-    return std::wstring( view.data(), view.length() );
-}
-
-inline std::wstring to_string( cwstring_span const & view )
-{
-    return std::wstring( view.data(), view.length() );
-}
-
-//
-// ensure_sentinel()
-//
-// Provides a way to obtain a span from a contiguous sequence
-// that ends with a (non-inclusive) sentinel value.
-//
-// Will fail-fast if sentinel cannot be found before max elements are examined.
-//
-namespace detail {
-
-template<class T, class SizeType, const T Sentinel>
-struct ensure
-{
-    static span<T> sentinel( T * seq, SizeType max = (std::numeric_limits<SizeType>::max)() )
-    {
-        typedef T * pointer;
-        typedef typename std::iterator_traits<pointer>::difference_type difference_type;
-
-        pointer cur = seq;
-
-        while ( std::distance( seq, cur ) < static_cast<difference_type>( max ) && *cur != Sentinel )
-            ++cur;
-
-        Expects( *cur == Sentinel );
-
-        return span<T>( seq, cur - seq );
-    }
-};
-} // namespace detail
-
-//
-// ensure_z - creates a string_span for a czstring or cwzstring.
-// Will fail fast if a null-terminator cannot be found before
-// the limit of size_type.
-//
-
-template< typename T >
-span<T> ensure_z( T * sz, size_t max = (std::numeric_limits<size_t>::max)() )
-{
-    return detail::ensure<T, size_t, 0>::sentinel( sz, max );
-}
-
-} // namespace gsl
-
-// at( std::vector ):
-
-gsl_MK_AT( std::vector )
-
-#endif // GSL_GSL_LITE_H_INCLUDED
-
-// end of file
diff --git a/onnxruntime/gsl/gsl-lite.h b/onnxruntime/gsl/gsl-lite.h
deleted file mode 100644
index 99c18d1d29..0000000000
--- a/onnxruntime/gsl/gsl-lite.h
+++ /dev/null
@@ -1,29 +0,0 @@
-//
-// gsl-lite is based on GSL: Guidelines Support Library.
-// For more information see https://github.com/martinmoene/gsl-lite
-//
-// Copyright (c) 2015 Martin Moene
-// Copyright (c) 2015 Microsoft Corporation. All rights reserved. 
-// 
-// This code is licensed under the MIT License (MIT). 
-// 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 
-// THE SOFTWARE. 
-
-// mimic MS include hierarchy
-
-#pragma once
-
-#ifndef GSL_GSL_LITE_H_INCLUDED
-#define GSL_GSL_LITE_H_INCLUDED
-
-#pragma message ("gsl-lite.h is deprecated since version 0.27.0, use gsl-lite.hpp instead.")
-
-#include "gsl-lite.hpp"
-
-#endif // GSL_GSL_LITE_H_INCLUDED
diff --git a/onnxruntime/gsl/gsl-lite.hpp b/onnxruntime/gsl/gsl-lite.hpp
deleted file mode 100644
index e6191e98ee..0000000000
--- a/onnxruntime/gsl/gsl-lite.hpp
+++ /dev/null
@@ -1,2836 +0,0 @@
-//
-// gsl-lite is based on GSL: Guidelines Support Library.
-// For more information see https://github.com/martinmoene/gsl-lite
-//
-// Copyright (c) 2015-2018 Martin Moene
-// Copyright (c) 2015-2018 Microsoft Corporation. All rights reserved.
-//
-// This code is licensed under the MIT License (MIT).
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-// THE SOFTWARE.
-
-#pragma once
-
-#ifndef GSL_GSL_LITE_HPP_INCLUDED
-#define GSL_GSL_LITE_HPP_INCLUDED
-
-#include <algorithm>
-#include <exception>
-#include <iterator>
-#include <limits>
-#include <memory>
-#include <iosfwd>
-#include <stdexcept>
-#include <string>
-#include <utility>
-#include <vector>
-
-#define gsl_lite_MAJOR 0
-#define gsl_lite_MINOR 35
-#define gsl_lite_PATCH 0
-
-#define gsl_lite_VERSION gsl_STRINGIFY(gsl_lite_MAJOR) "." gsl_STRINGIFY(gsl_lite_MINOR) "." gsl_STRINGIFY(gsl_lite_PATCH)
-
-// gsl-lite backward compatibility:
-
-#ifdef gsl_CONFIG_ALLOWS_SPAN_CONTAINER_CTOR
-#define gsl_CONFIG_ALLOWS_UNCONSTRAINED_SPAN_CONTAINER_CTOR gsl_CONFIG_ALLOWS_SPAN_CONTAINER_CTOR
-#pragma message("gsl_CONFIG_ALLOWS_SPAN_CONTAINER_CTOR is deprecated since gsl-lite 0.7.0; replace with gsl_CONFIG_ALLOWS_UNCONSTRAINED_SPAN_CONTAINER_CTOR, or consider span(with_container, cont).")
-#endif
-
-#if defined(gsl_CONFIG_CONTRACT_LEVEL_EXPECTS_ONLY)
-#pragma message("gsl_CONFIG_CONTRACT_LEVEL_EXPECTS_ONLY is deprecated since gsl-lite 0.35.0; replace with gsl_CONFIG_CONTRACT_LEVEL_ON and gsl_CONFIG_CONTRACT_EXPECTS_ONLY.")
-#define gsl_CONFIG_CONTRACT_LEVEL_ON
-#define gsl_CONFIG_CONTRACT_EXPECTS_ONLY
-#elif defined(gsl_CONFIG_CONTRACT_LEVEL_ENSURES_ONLY)
-#pragma message("gsl_CONFIG_CONTRACT_LEVEL_ENSURES_ONLY is deprecated since gsl-lite 0.35.0; replace with gsl_CONFIG_CONTRACT_LEVEL_ON and gsl_CONFIG_CONTRACT_ENSURES_ONLY.")
-#define gsl_CONFIG_CONTRACT_LEVEL_ON
-#define gsl_CONFIG_CONTRACT_ENSURES_ONLY
-#endif
-
-// M-GSL compatibility:
-
-#if defined(GSL_THROW_ON_CONTRACT_VIOLATION)
-#define gsl_CONFIG_CONTRACT_VIOLATION_THROWS 1
-#endif
-
-#if defined(GSL_TERMINATE_ON_CONTRACT_VIOLATION)
-#define gsl_CONFIG_CONTRACT_VIOLATION_TERMINATES 1
-#endif
-
-#if defined(GSL_UNENFORCED_ON_CONTRACT_VIOLATION)
-#define gsl_CONFIG_CONTRACT_LEVEL_OFF 1
-#endif
-
-// Configuration: Features
-
-#ifndef gsl_FEATURE_WITH_CONTAINER_TO_STD
-#define gsl_FEATURE_WITH_CONTAINER_TO_STD 99
-#endif
-
-#ifndef gsl_FEATURE_MAKE_SPAN_TO_STD
-#define gsl_FEATURE_MAKE_SPAN_TO_STD 99
-#endif
-
-#ifndef gsl_FEATURE_BYTE_SPAN_TO_STD
-#define gsl_FEATURE_BYTE_SPAN_TO_STD 99
-#endif
-
-#ifndef gsl_FEATURE_IMPLICIT_MACRO
-#define gsl_FEATURE_IMPLICIT_MACRO 0
-#endif
-
-#ifndef gsl_FEATURE_OWNER_MACRO
-#define gsl_FEATURE_OWNER_MACRO 1
-#endif
-
-#ifndef gsl_FEATURE_EXPERIMENTAL_RETURN_GUARD
-#define gsl_FEATURE_EXPERIMENTAL_RETURN_GUARD 0
-#endif
-
-// Configuration: Other
-
-#ifndef gsl_CONFIG_DEPRECATE_TO_LEVEL
-#define gsl_CONFIG_DEPRECATE_TO_LEVEL 0
-#endif
-
-#ifndef gsl_CONFIG_SPAN_INDEX_TYPE
-#define gsl_CONFIG_SPAN_INDEX_TYPE size_t
-#endif
-
-#ifndef gsl_CONFIG_NOT_NULL_EXPLICIT_CTOR
-#define gsl_CONFIG_NOT_NULL_EXPLICIT_CTOR 0
-#endif
-
-#ifndef gsl_CONFIG_NOT_NULL_GET_BY_CONST_REF
-#define gsl_CONFIG_NOT_NULL_GET_BY_CONST_REF 0
-#endif
-
-#ifndef gsl_CONFIG_CONFIRMS_COMPILATION_ERRORS
-#define gsl_CONFIG_CONFIRMS_COMPILATION_ERRORS 0
-#endif
-
-#ifndef gsl_CONFIG_ALLOWS_NONSTRICT_SPAN_COMPARISON
-#define gsl_CONFIG_ALLOWS_NONSTRICT_SPAN_COMPARISON 1
-#endif
-
-#ifndef gsl_CONFIG_ALLOWS_UNCONSTRAINED_SPAN_CONTAINER_CTOR
-#define gsl_CONFIG_ALLOWS_UNCONSTRAINED_SPAN_CONTAINER_CTOR 0
-#endif
-
-#if 2 <= defined(gsl_CONFIG_CONTRACT_LEVEL_AUDIT) + defined(gsl_CONFIG_CONTRACT_LEVEL_ON) + defined(gsl_CONFIG_CONTRACT_LEVEL_ASSUME) + defined(gsl_CONFIG_CONTRACT_LEVEL_OFF)
-#error only one of gsl_CONFIG_CONTRACT_LEVEL_AUDIT, gsl_CONFIG_CONTRACT_LEVEL_ON, gsl_CONFIG_CONTRACT_LEVEL_ASSUME, and gsl_CONFIG_CONTRACT_LEVEL_OFF may be defined.
-#elif defined(gsl_CONFIG_CONTRACT_LEVEL_AUDIT)
-#define gsl_CONFIG_CONTRACT_LEVEL_MASK_0 0x33
-#elif defined(gsl_CONFIG_CONTRACT_LEVEL_ON)
-#define gsl_CONFIG_CONTRACT_LEVEL_MASK_0 0x11
-#elif defined(gsl_CONFIG_CONTRACT_LEVEL_ASSUME)
-#define gsl_CONFIG_CONTRACT_LEVEL_MASK_0 0x44
-#elif defined(gsl_CONFIG_CONTRACT_LEVEL_OFF)
-#define gsl_CONFIG_CONTRACT_LEVEL_MASK_0 0x00
-#else
-#define gsl_CONFIG_CONTRACT_LEVEL_MASK_0 0x11
-#endif
-
-#if defined(gsl_CONFIG_CONTRACT_EXPECTS_ONLY)
-#define gsl_CONFIG_CONTRACT_LEVEL_MASK (gsl_CONFIG_CONTRACT_LEVEL_MASK_0 & 0x0F)
-#elif defined(gsl_CONFIG_CONTRACT_ENSURES_ONLY)
-#define gsl_CONFIG_CONTRACT_LEVEL_MASK (gsl_CONFIG_CONTRACT_LEVEL_MASK_0 & 0xF0)
-#else
-#define gsl_CONFIG_CONTRACT_LEVEL_MASK gsl_CONFIG_CONTRACT_LEVEL_MASK_0
-#endif
-
-#if 2 <= defined(gsl_CONFIG_CONTRACT_VIOLATION_THROWS) + defined(gsl_CONFIG_CONTRACT_VIOLATION_TERMINATES) + defined(gsl_CONFIG_CONTRACT_VIOLATION_CALLS_HANDLER)
-#error only one of gsl_CONFIG_CONTRACT_VIOLATION_THROWS, gsl_CONFIG_CONTRACT_VIOLATION_TERMINATES, and gsl_CONFIG_CONTRACT_VIOLATION_CALLS_HANDLER may be defined.
-#elif defined(gsl_CONFIG_CONTRACT_VIOLATION_THROWS)
-#define gsl_CONFIG_CONTRACT_VIOLATION_THROWS_V 1
-#define gsl_CONFIG_CONTRACT_VIOLATION_CALLS_HANDLER_V 0
-#elif defined(gsl_CONFIG_CONTRACT_VIOLATION_TERMINATES)
-#define gsl_CONFIG_CONTRACT_VIOLATION_THROWS_V 0
-#define gsl_CONFIG_CONTRACT_VIOLATION_CALLS_HANDLER_V 0
-#elif defined(gsl_CONFIG_CONTRACT_VIOLATION_CALLS_HANDLER)
-#define gsl_CONFIG_CONTRACT_VIOLATION_THROWS_V 0
-#define gsl_CONFIG_CONTRACT_VIOLATION_CALLS_HANDLER_V 1
-#else
-#define gsl_CONFIG_CONTRACT_VIOLATION_THROWS_V 0
-#define gsl_CONFIG_CONTRACT_VIOLATION_CALLS_HANDLER_V 0
-#endif
-
-#if defined(gsl_CONFIG_CONTRACT_LEVEL_ASSUME) && (defined(gsl_CONFIG_CONTRACT_VIOLATION_THROWS) || defined(gsl_CONFIG_CONTRACT_VIOLATION_TERMINATES) || defined(gsl_CONFIG_CONTRACT_VIOLATION_CALLS_HANDLER))
-// `gsl_CONFIG_CONTRACT_LEVEL_ASSUME` should not be combined with any of the violation
-// response macros. Contract violations are undefined behavior in ASSUME mode, and
-// code which expects a particular violation response will not work as expected.
-#error cannot define gsl_CONFIG_CONTRACT_VIOLATION_THROWS, gsl_CONFIG_CONTRACT_VIOLATION_TERMINATES, or gsl_CONFIG_CONTRACT_VIOLATION_CALLS_HANDLER if gsl_CONFIG_CONTRACT_LEVEL_ASSUME is defined.
-#endif
-
-// C++ language version detection (C++20 is speculative):
-// Note: VC14.0/1900 (VS2015) lacks too much from C++14.
-
-#ifndef gsl_CPLUSPLUS
-#if defined(_MSVC_LANG) && !defined(__clang__)
-#define gsl_CPLUSPLUS (_MSC_VER == 1900 ? 201103L : _MSVC_LANG)
-#else
-#define gsl_CPLUSPLUS __cplusplus
-#endif
-#endif
-
-#define gsl_CPP98_OR_GREATER (gsl_CPLUSPLUS >= 199711L)
-#define gsl_CPP11_OR_GREATER (gsl_CPLUSPLUS >= 201103L)
-#define gsl_CPP14_OR_GREATER (gsl_CPLUSPLUS >= 201402L)
-#define gsl_CPP17_OR_GREATER (gsl_CPLUSPLUS >= 201703L)
-#define gsl_CPP20_OR_GREATER (gsl_CPLUSPLUS >= 202000L)
-
-// C++ language version (represent 98 as 3):
-
-#define gsl_CPLUSPLUS_V (gsl_CPLUSPLUS / 100 - (gsl_CPLUSPLUS > 200000 ? 2000 : 1994))
-
-// half-open range [lo..hi):
-#define gsl_BETWEEN(v, lo, hi) ((lo) <= (v) && (v) < (hi))
-
-// Compiler versions:
-//
-// MSVC++ 6.0  _MSC_VER == 1200 (Visual Studio 6.0)
-// MSVC++ 7.0  _MSC_VER == 1300 (Visual Studio .NET 2002)
-// MSVC++ 7.1  _MSC_VER == 1310 (Visual Studio .NET 2003)
-// MSVC++ 8.0  _MSC_VER == 1400 (Visual Studio 2005)
-// MSVC++ 9.0  _MSC_VER == 1500 (Visual Studio 2008)
-// MSVC++ 10.0 _MSC_VER == 1600 (Visual Studio 2010)
-// MSVC++ 11.0 _MSC_VER == 1700 (Visual Studio 2012)
-// MSVC++ 12.0 _MSC_VER == 1800 (Visual Studio 2013)
-// MSVC++ 14.0 _MSC_VER == 1900 (Visual Studio 2015)
-// MSVC++ 14.1 _MSC_VER >= 1910 (Visual Studio 2017)
-
-#if defined(_MSC_VER) && !defined(__clang__)
-#define gsl_COMPILER_MSVC_VER (_MSC_VER)
-#define gsl_COMPILER_MSVC_VERSION (_MSC_VER / 10 - 10 * (5 + (_MSC_VER < 1900)))
-#else
-#define gsl_COMPILER_MSVC_VER 0
-#define gsl_COMPILER_MSVC_VERSION 0
-#endif
-
-#define gsl_COMPILER_VERSION(major, minor, patch) (10 * (10 * (major) + (minor)) + (patch))
-
-#if defined(__clang__)
-#define gsl_COMPILER_CLANG_VERSION gsl_COMPILER_VERSION(__clang_major__, __clang_minor__, __clang_patchlevel__)
-#else
-#define gsl_COMPILER_CLANG_VERSION 0
-#endif
-
-#if defined(__GNUC__) && !defined(__clang__)
-#define gsl_COMPILER_GNUC_VERSION gsl_COMPILER_VERSION(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__)
-#else
-#define gsl_COMPILER_GNUC_VERSION 0
-#endif
-
-// Method enabling (C++98, VC120 (VS2013) cannot use __VA_ARGS__)
-
-#define gsl_REQUIRES_0(VA) \
-  template <bool B = (VA), typename std::enable_if<B, int>::type = 0>
-
-#define gsl_REQUIRES_T(VA) \
-  , typename = typename std::enable_if<(VA), gsl::detail::enabler>::type
-
-#define gsl_REQUIRES_R(R, VA) \
-  typename std::enable_if<VA, R>::type
-
-#define gsl_REQUIRES_A(VA) \
-  , typename std::enable_if<VA, void*>::type = nullptr
-
-// Compiler non-strict aliasing:
-
-#if defined(__clang__) || defined(__GNUC__)
-#define gsl_may_alias __attribute__((__may_alias__))
-#else
-#define gsl_may_alias
-#endif
-
-// Presence of gsl, language and library features:
-
-#define gsl_IN_STD(v) (((v) == 98 ? 3 : (v)) >= gsl_CPLUSPLUS_V)
-
-#define gsl_DEPRECATE_TO_LEVEL(level) (level <= gsl_CONFIG_DEPRECATE_TO_LEVEL)
-#define gsl_FEATURE_TO_STD(feature) (gsl_IN_STD(gsl_FEATURE(feature##_TO_STD)))
-#define gsl_FEATURE(feature) (gsl_FEATURE_##feature)
-#define gsl_CONFIG(feature) (gsl_CONFIG_##feature)
-#define gsl_HAVE(feature) (gsl_HAVE_##feature)
-
-// Presence of wide character support:
-
-#ifdef __DJGPP__
-#define gsl_HAVE_WCHAR 0
-#else
-#define gsl_HAVE_WCHAR 1
-#endif
-
-// Presence of language & library features:
-
-#ifdef _HAS_CPP0X
-#define gsl_HAS_CPP0X _HAS_CPP0X
-#else
-#define gsl_HAS_CPP0X 0
-#endif
-
-#define gsl_CPP11_100 (gsl_CPP11_OR_GREATER || gsl_COMPILER_MSVC_VER >= 1600)
-#define gsl_CPP11_110 (gsl_CPP11_OR_GREATER || gsl_COMPILER_MSVC_VER >= 1700)
-#define gsl_CPP11_120 (gsl_CPP11_OR_GREATER || gsl_COMPILER_MSVC_VER >= 1800)
-#define gsl_CPP11_140 (gsl_CPP11_OR_GREATER || gsl_COMPILER_MSVC_VER >= 1900)
-
-#define gsl_CPP14_000 (gsl_CPP14_OR_GREATER)
-#define gsl_CPP14_120 (gsl_CPP14_OR_GREATER || gsl_COMPILER_MSVC_VER >= 1800)
-#define gsl_CPP14_140 (gsl_CPP14_OR_GREATER || gsl_COMPILER_MSVC_VER >= 1900)
-
-#define gsl_CPP17_000 (gsl_CPP17_OR_GREATER)
-#define gsl_CPP17_140 (gsl_CPP17_OR_GREATER || gsl_COMPILER_MSVC_VER >= 1900)
-
-#define gsl_CPP11_140_CPP0X_90 (gsl_CPP11_140 || (gsl_COMPILER_MSVC_VER >= 1500 && gsl_HAS_CPP0X))
-#define gsl_CPP11_140_CPP0X_100 (gsl_CPP11_140 || (gsl_COMPILER_MSVC_VER >= 1600 && gsl_HAS_CPP0X))
-
-// Presence of C++11 language features:
-
-#define gsl_HAVE_AUTO gsl_CPP11_100
-#define gsl_HAVE_NULLPTR gsl_CPP11_100
-#define gsl_HAVE_RVALUE_REFERENCE gsl_CPP11_100
-
-#define gsl_HAVE_ENUM_CLASS gsl_CPP11_110
-
-#define gsl_HAVE_ALIAS_TEMPLATE gsl_CPP11_120
-#define gsl_HAVE_DEFAULT_FUNCTION_TEMPLATE_ARG gsl_CPP11_120
-#define gsl_HAVE_EXPLICIT gsl_CPP11_120
-#define gsl_HAVE_INITIALIZER_LIST gsl_CPP11_120
-
-#define gsl_HAVE_CONSTEXPR_11 gsl_CPP11_140
-#define gsl_HAVE_IS_DEFAULT gsl_CPP11_140
-#define gsl_HAVE_IS_DELETE gsl_CPP11_140
-#define gsl_HAVE_NOEXCEPT gsl_CPP11_140
-
-#if gsl_CPP11_OR_GREATER
-// see above
-#endif
-
-// Presence of C++14 language features:
-
-#define gsl_HAVE_CONSTEXPR_14 gsl_CPP14_000
-#define gsl_HAVE_DECLTYPE_AUTO gsl_CPP14_140
-
-// Presence of C++17 language features:
-// MSVC: template parameter deduction guides since Visual Studio 2017 v15.7
-
-#define gsl_HAVE_ENUM_CLASS_CONSTRUCTION_FROM_UNDERLYING_TYPE gsl_CPP17_000
-#define gsl_HAVE_DEDUCTION_GUIDES (gsl_CPP17_000 && !gsl_BETWEEN(gsl_COMPILER_MSVC_VERSION, 1, 999))
-
-// Presence of C++ library features:
-
-#define gsl_HAVE_ADDRESSOF gsl_CPP17_000
-#define gsl_HAVE_ARRAY gsl_CPP11_110
-#define gsl_HAVE_TYPE_TRAITS gsl_CPP11_110
-#define gsl_HAVE_TR1_TYPE_TRAITS gsl_CPP11_110
-
-#define gsl_HAVE_CONTAINER_DATA_METHOD gsl_CPP11_140_CPP0X_90
-#define gsl_HAVE_STD_DATA gsl_CPP17_000
-
-#define gsl_HAVE_SIZED_TYPES gsl_CPP11_140
-
-#define gsl_HAVE_MAKE_SHARED gsl_CPP11_140_CPP0X_100
-#define gsl_HAVE_SHARED_PTR gsl_CPP11_140_CPP0X_100
-#define gsl_HAVE_UNIQUE_PTR gsl_CPP11_140_CPP0X_100
-
-#define gsl_HAVE_MAKE_UNIQUE gsl_CPP14_120
-
-#define gsl_HAVE_UNCAUGHT_EXCEPTIONS gsl_CPP17_140
-
-#define gsl_HAVE_ADD_CONST gsl_HAVE_TYPE_TRAITS
-#define gsl_HAVE_INTEGRAL_CONSTANT gsl_HAVE_TYPE_TRAITS
-#define gsl_HAVE_REMOVE_CONST gsl_HAVE_TYPE_TRAITS
-#define gsl_HAVE_REMOVE_REFERENCE gsl_HAVE_TYPE_TRAITS
-
-#define gsl_HAVE_TR1_ADD_CONST gsl_HAVE_TR1_TYPE_TRAITS
-#define gsl_HAVE_TR1_INTEGRAL_CONSTANT gsl_HAVE_TR1_TYPE_TRAITS
-#define gsl_HAVE_TR1_REMOVE_CONST gsl_HAVE_TR1_TYPE_TRAITS
-#define gsl_HAVE_TR1_REMOVE_REFERENCE gsl_HAVE_TR1_TYPE_TRAITS
-
-// C++ feature usage:
-
-#if gsl_HAVE(ADDRESSOF)
-#define gsl_ADDRESSOF(x) std::addressof(x)
-#else
-#define gsl_ADDRESSOF(x) (&x)
-#endif
-
-#if gsl_HAVE(CONSTEXPR_11)
-#define gsl_constexpr constexpr
-#else
-#define gsl_constexpr /*constexpr*/
-#endif
-
-#if gsl_HAVE(CONSTEXPR_14)
-#define gsl_constexpr14 constexpr
-#else
-#define gsl_constexpr14 /*constexpr*/
-#endif
-
-#if gsl_HAVE(EXPLICIT)
-#define gsl_explicit explicit
-#else
-#define gsl_explicit /*explicit*/
-#endif
-
-#if gsl_FEATURE(IMPLICIT_MACRO)
-#define implicit /*implicit*/
-#endif
-
-#if gsl_HAVE(IS_DELETE)
-#define gsl_is_delete = delete
-#else
-#define gsl_is_delete
-#endif
-
-#if gsl_HAVE(IS_DELETE)
-#define gsl_is_delete_access public
-#else
-#define gsl_is_delete_access private
-#endif
-
-#if !gsl_HAVE(NOEXCEPT) || gsl_CONFIG(CONTRACT_VIOLATION_THROWS_V)
-#define gsl_noexcept /*noexcept*/
-#else
-#define gsl_noexcept noexcept
-#endif
-
-#if gsl_HAVE(NULLPTR)
-#define gsl_nullptr nullptr
-#else
-#define gsl_nullptr NULL
-#endif
-
-#define gsl_DIMENSION_OF(a) (sizeof(a) / sizeof(0 [a]))
-
-// Other features:
-
-#define gsl_HAVE_CONSTRAINED_SPAN_CONTAINER_CTOR \
-  (gsl_HAVE_DEFAULT_FUNCTION_TEMPLATE_ARG && gsl_HAVE_CONTAINER_DATA_METHOD)
-
-// Note: !defined(__NVCC__) doesn't work with nvcc here:
-#define gsl_HAVE_UNCONSTRAINED_SPAN_CONTAINER_CTOR \
-  (gsl_CONFIG_ALLOWS_UNCONSTRAINED_SPAN_CONTAINER_CTOR && (__NVCC__ == 0))
-
-// GSL API (e.g. for CUDA platform):
-
-#ifndef gsl_api
-#ifdef __CUDACC__
-#define gsl_api __host__ __device__
-#else
-#define gsl_api /*gsl_api*/
-#endif
-#endif
-
-// Additional includes:
-
-#if gsl_HAVE(ARRAY)
-#include <array>
-#endif
-
-#if gsl_HAVE(INITIALIZER_LIST)
-#include <initializer_list>
-#endif
-
-#if gsl_HAVE(TYPE_TRAITS)
-#include <type_traits>
-#elif gsl_HAVE(TR1_TYPE_TRAITS)
-#include <tr1/type_traits>
-#endif
-
-#if gsl_HAVE(SIZED_TYPES)
-#include <cstdint>
-#endif
-
-// MSVC warning suppression macros:
-
-#if gsl_COMPILER_MSVC_VERSION >= 140
-#define gsl_SUPPRESS_MSGSL_WARNING(expr) [[gsl::suppress(expr)]]
-#define gsl_SUPPRESS_MSVC_WARNING(code, descr) __pragma(warning(suppress \
-                                                                : code))
-#define gsl_DISABLE_MSVC_WARNINGS(codes) __pragma(warning(push)) __pragma(warning(disable \
-                                                                                  : codes))
-#define gsl_RESTORE_MSVC_WARNINGS() __pragma(warning(pop))
-#else
-#define gsl_SUPPRESS_MSGSL_WARNING(expr)
-#define gsl_SUPPRESS_MSVC_WARNING(code, descr)
-#define gsl_DISABLE_MSVC_WARNINGS(codes)
-#define gsl_RESTORE_MSVC_WARNINGS()
-#endif
-
-// Suppress the following MSVC GSL warnings:
-// - C26410: gsl::r.32: the parameter 'ptr' is a reference to const unique pointer, use const T* or const T& instead
-// - C26415: gsl::r.30: smart pointer parameter 'ptr' is used only to access contained pointer. Use T* or T& instead
-// - C26418: gsl::r.36: shared pointer parameter 'ptr' is not copied or moved. Use T* or T& instead
-// - C26472: gsl::t.1 : don't use a static_cast for arithmetic conversions;
-//                      use brace initialization, gsl::narrow_cast or gsl::narow
-// - C26439: gsl::f.6 : special function 'function' can be declared 'noexcept'
-// - C26440: gsl::f.6 : function 'function' can be declared 'noexcept'
-// - C26473: gsl::t.1 : don't cast between pointer types where the source type and the target type are the same
-// - C26481: gsl::b.1 : don't use pointer arithmetic. Use span instead
-// - C26482: gsl::b.2 : only index into arrays using constant expressions
-// - C26446: gdl::b.4 : prefer to use gsl::at() instead of unchecked subscript operator
-// - C26490: gsl::t.1 : don't use reinterpret_cast
-// - C26487: gsl::l.4 : don't return a pointer '(<some number>'s result)' that may be invalid
-
-gsl_DISABLE_MSVC_WARNINGS(26410 26415 26418 26472 26439 26440 26473 26481 26482 26446 26490 26487)
-
-    namespace gsl {
-  // forward declare span<>:
-
-  template <class T>
-  class span;
-
-  // C++11 emulation:
-
-  namespace std11 {
-
-#if gsl_HAVE(ADD_CONST)
-
-  using std::add_const;
-
-#elif gsl_HAVE(TR1_ADD_CONST)
-
-  using std::tr1::add_const;
-
-#else
-
-  template <class T>
-  struct add_const { typedef const T type; };
-
-#endif  // gsl_HAVE( ADD_CONST )
-
-#if gsl_HAVE(REMOVE_CONST)
-
-  using std::remove_const;
-  using std::remove_cv;
-  using std::remove_volatile;
-
-#elif gsl_HAVE(TR1_REMOVE_CONST)
-
-  using std::tr1::remove_const;
-  using std::tr1::remove_cv;
-  using std::tr1::remove_volatile;
-
-#else
-
-  template <class T>
-  struct remove_const { typedef T type; };
-  template <class T>
-  struct remove_const<T const> { typedef T type; };
-
-  template <class T>
-  struct remove_volatile { typedef T type; };
-  template <class T>
-  struct remove_volatile<T volatile> { typedef T type; };
-
-  template <class T>
-  struct remove_cv {
-    typedef typename remove_volatile<typename remove_const<T>::type>::type type;
-  };
-
-#endif  // gsl_HAVE( REMOVE_CONST )
-
-#if gsl_HAVE(INTEGRAL_CONSTANT)
-
-  using std::false_type;
-  using std::integral_constant;
-  using std::true_type;
-
-#elif gsl_HAVE(TR1_INTEGRAL_CONSTANT)
-
-  using std::tr1::false_type;
-  using std::tr1::integral_constant;
-  using std::tr1::true_type;
-
-#else
-
-  template <class T, T v>
-  struct integral_constant {
-    enum { value = v };
-  };
-  typedef integral_constant<bool, true> true_type;
-  typedef integral_constant<bool, false> false_type;
-
-#endif
-
-  }  // namespace std11
-
-  // C++17 emulation:
-
-  namespace std17 {
-
-  template <bool v>
-  struct bool_constant : std11::integral_constant<bool, v> {};
-
-#if gsl_CPP11_120
-
-  template <class...>
-  using void_t = void;
-
-#endif
-
-#if gsl_HAVE(STD_DATA)
-
-  using std::data;
-  using std::size;
-
-#elif gsl_HAVE(CONSTRAINED_SPAN_CONTAINER_CTOR)
-
-  template <class T, size_t N>
-  inline gsl_constexpr auto size(const T (&)[N]) gsl_noexcept -> size_t {
-    return N;
-  }
-
-  template <class C>
-  inline gsl_constexpr auto size(C const& cont) -> decltype(cont.size()) {
-    return cont.size();
-  }
-
-  template <class T, size_t N>
-  inline gsl_constexpr auto data(T (&arr)[N]) gsl_noexcept -> T* {
-    return &arr[0];
-  }
-
-  template <class C>
-  inline gsl_constexpr auto data(C& cont) -> decltype(cont.data()) {
-    return cont.data();
-  }
-
-  template <class C>
-  inline gsl_constexpr auto data(C const& cont) -> decltype(cont.data()) {
-    return cont.data();
-  }
-
-  template <class E>
-  inline gsl_constexpr auto data(std::initializer_list<E> il) gsl_noexcept -> E const* {
-    return il.begin();
-  }
-
-#endif  // span_HAVE( DATA )
-
-  }  // namespace std17
-
-  namespace detail {
-
-  /// for nsel_REQUIRES_T
-
-  /*enum*/ class enabler {};
-
-#if gsl_HAVE(TYPE_TRAITS)
-
-  template <class Q>
-  struct is_span_oracle : std::false_type {};
-
-  template <class T>
-  struct is_span_oracle<span<T> > : std::true_type {};
-
-  template <class Q>
-  struct is_span : is_span_oracle<typename std::remove_cv<Q>::type> {};
-
-  template <class Q>
-  struct is_std_array_oracle : std::false_type {};
-
-#if gsl_HAVE(ARRAY)
-
-  template <class T, std::size_t Extent>
-  struct is_std_array_oracle<std::array<T, Extent> > : std::true_type {};
-
-#endif
-
-  template <class Q>
-  struct is_std_array : is_std_array_oracle<typename std::remove_cv<Q>::type> {};
-
-  template <class Q>
-  struct is_array : std::false_type {};
-
-  template <class T>
-  struct is_array<T[]> : std::true_type {};
-
-  template <class T, std::size_t N>
-  struct is_array<T[N]> : std::true_type {};
-
-#if gsl_CPP11_140 && !gsl_BETWEEN(gsl_COMPILER_GNUC_VERSION, 1, 500)
-
-  template <class, class = void>
-  struct has_size_and_data : std::false_type {};
-
-  template <class C>
-  struct has_size_and_data<
-      C, std17::void_t<
-             decltype(std17::size(std::declval<C>())),
-             decltype(std17::data(std::declval<C>()))> > : std::true_type {};
-
-  template <class, class, class = void>
-  struct is_compatible_element : std::false_type {};
-
-  template <class C, class E>
-  struct is_compatible_element<
-      C, E, std17::void_t<decltype(std17::data(std::declval<C>()))> > : std::is_convertible<typename std::remove_pointer<decltype(std17::data(std::declval<C&>()))>::type (*)[], E (*)[]> {};
-
-  template <class C>
-  struct is_container : std17::bool_constant<
-                            !is_span<C>::value && !is_array<C>::value && !is_std_array<C>::value && has_size_and_data<C>::value> {};
-
-  template <class C, class E>
-  struct is_compatible_container : std17::bool_constant<
-                                       is_container<C>::value && is_compatible_element<C, E>::value> {};
-
-#else  // gsl_CPP11_140
-
-  template <
-      class C, class E gsl_REQUIRES_T((!is_span<C>::value && !is_array<C>::value && !is_std_array<C>::value && (std::is_convertible<typename std::remove_pointer<decltype(std17::data(std::declval<C&>()))>::type (*)[], E (*)[]>::value)
-                                       //  &&   has_size_and_data< C >::value
-                                       )),
-      class = decltype(std17::size(std::declval<C>())), class = decltype(std17::data(std::declval<C>()))>
-  struct is_compatible_container : std::true_type {};
-
-#endif  // gsl_CPP11_140
-
-#endif  // gsl_HAVE( TYPE_TRAITS )
-
-  }  // namespace detail
-
-  //
-  // GSL.util: utilities
-  //
-
-  // index type for all container indexes/subscripts/sizes
-  typedef gsl_CONFIG_SPAN_INDEX_TYPE index;  // p0122r3 uses std::ptrdiff_t
-
-//
-// GSL.owner: ownership pointers
-//
-#if gsl_HAVE(SHARED_PTR)
-  using std::make_shared;
-  using std::shared_ptr;
-  using std::unique_ptr;
-#if gsl_HAVE(MAKE_UNIQUE)
-  using std::make_unique;
-#endif
-#endif
-
-#if gsl_HAVE(ALIAS_TEMPLATE)
-#if gsl_HAVE(TYPE_TRAITS)
-  template <class T
-                gsl_REQUIRES_T(std::is_pointer<T>::value)>
-  using owner = T;
-#else
-  template <class T>
-  using owner = T;
-#endif
-#else
-  template <class T>
-  struct owner { typedef T type; };
-#endif
-
-#define gsl_HAVE_OWNER_TEMPLATE gsl_HAVE_ALIAS_TEMPLATE
-
-#if gsl_FEATURE(OWNER_MACRO)
-#if gsl_HAVE(OWNER_TEMPLATE)
-#define Owner(t) ::gsl::owner<t>
-#else
-#define Owner(t) ::gsl::owner<t>::type
-#endif
-#endif
-
-  //
-  // GSL.assert: assertions
-  //
-
-#if gsl_HAVE(TYPE_TRAITS)
-#define gsl_ELIDE_CONTRACT(x) static_assert(::std::is_convertible<decltype((x)), bool>::value, "argument of contract check must be convertible to bool")
-#else
-#define gsl_ELIDE_CONTRACT(x)
-#endif
-
-#if defined(__CUDACC__) && defined(__CUDA_ARCH__)
-#define gsl_ASSUME(x) gsl_ELIDE_CONTRACT(x) /* there is no assume intrinsic in CUDA device code */
-#elif gsl_COMPILER_MSVC_VERSION
-#define gsl_ASSUME(x) __assume(x)
-#elif gsl_COMPILER_GNUC_VERSION
-#define gsl_ASSUME(x) ((x) ? static_cast<void>(0) : __builtin_unreachable())
-#elif defined(__has_builtin)
-#if __has_builtin(__builtin_unreachable)
-#define gsl_ASSUME(x) ((x) ? static_cast<void>(0) : __builtin_unreachable())
-#endif
-#else
-#define gsl_ASSUME(x) gsl_ELIDE_CONTRACT(x) /* unknown compiler; cannot rely on assume intrinsic */
-#endif
-
-#define gsl_ELIDE_CONTRACT_EXPECTS (0 == (gsl_CONFIG_CONTRACT_LEVEL_MASK & 0x01))
-#define gsl_ELIDE_CONTRACT_ENSURES (0 == (gsl_CONFIG_CONTRACT_LEVEL_MASK & 0x10))
-#define gsl_ASSUME_CONTRACT_EXPECTS (0 != (gsl_CONFIG_CONTRACT_LEVEL_MASK & 0x04))
-#define gsl_ASSUME_CONTRACT_ENSURES (0 != (gsl_CONFIG_CONTRACT_LEVEL_MASK & 0x40))
-#define gsl_ELIDE_CONTRACT_EXPECTS_AUDIT (0 == (gsl_CONFIG_CONTRACT_LEVEL_MASK & 0x02))
-#define gsl_ELIDE_CONTRACT_ENSURES_AUDIT (0 == (gsl_CONFIG_CONTRACT_LEVEL_MASK & 0x20))
-
-#if gsl_ELIDE_CONTRACT_EXPECTS
-#if gsl_ASSUME_CONTRACT_EXPECTS
-#define Expects(x) gsl_ASSUME(x)
-#else
-#define Expects(x) gsl_ELIDE_CONTRACT(x)
-#endif
-#elif gsl_CONFIG(CONTRACT_VIOLATION_THROWS_V)
-#define Expects(x) ::gsl::fail_fast_assert((x), "GSL: Precondition failure at " __FILE__ ":" gsl_STRINGIFY(__LINE__))
-#elif gsl_CONFIG(CONTRACT_VIOLATION_CALLS_HANDLER_V)
-#define Expects(x) ::gsl::fail_fast_assert((x), #x, "GSL: Precondition failure", __FILE__, __LINE__)
-#else
-#define Expects(x) ::gsl::fail_fast_assert((x))
-#endif
-
-#if gsl_ELIDE_CONTRACT_EXPECTS_AUDIT
-#define ExpectsAudit(x) gsl_ELIDE_CONTRACT(x)
-#elif gsl_CONFIG(CONTRACT_VIOLATION_THROWS_V)
-#define ExpectsAudit(x) ::gsl::fail_fast_assert((x), "GSL: Precondition failure at " __FILE__ ":" gsl_STRINGIFY(__LINE__))
-#elif gsl_CONFIG(CONTRACT_VIOLATION_CALLS_HANDLER_V)
-#define ExpectsAudit(x) ::gsl::fail_fast_assert((x), #x, "GSL: Precondition failure", __FILE__, __LINE__)
-#else
-#define ExpectsAudit(x) ::gsl::fail_fast_assert((x))
-#endif
-
-#if gsl_ELIDE_CONTRACT_ENSURES
-#if gsl_ASSUME_CONTRACT_ENSURES
-#define Ensures(x) gsl_ASSUME(x)
-#else
-#define Ensures(x) gsl_ELIDE_CONTRACT(x)
-#endif
-#elif gsl_CONFIG(CONTRACT_VIOLATION_THROWS_V)
-#define Ensures(x) ::gsl::fail_fast_assert((x), "GSL: Postcondition failure at " __FILE__ ":" gsl_STRINGIFY(__LINE__))
-#elif gsl_CONFIG(CONTRACT_VIOLATION_CALLS_HANDLER_V)
-#define Ensures(x) ::gsl::fail_fast_assert((x), #x, "GSL: Postcondition failure", __FILE__, __LINE__)
-#else
-#define Ensures(x) ::gsl::fail_fast_assert((x))
-#endif
-
-#if gsl_ELIDE_CONTRACT_ENSURES_AUDIT
-#define EnsuresAudit(x) gsl_ELIDE_CONTRACT(x)
-#elif gsl_CONFIG(CONTRACT_VIOLATION_THROWS_V)
-#define EnsuresAudit(x) ::gsl::fail_fast_assert((x), "GSL: Postcondition failure at " __FILE__ ":" gsl_STRINGIFY(__LINE__))
-#elif gsl_CONFIG(CONTRACT_VIOLATION_CALLS_HANDLER_V)
-#define EnsuresAudit(x) ::gsl::fail_fast_assert((x), #x, "GSL: Postcondition failure", __FILE__, __LINE__)
-#else
-#define EnsuresAudit(x) ::gsl::fail_fast_assert((x))
-#endif
-
-#define gsl_STRINGIFY(x) gsl_STRINGIFY_(x)
-#define gsl_STRINGIFY_(x) #x
-
-  struct fail_fast : public std::logic_error {
-    gsl_api explicit fail_fast(char const* const message)
-        : std::logic_error(message) {}
-  };
-
-#if gsl_CONFIG(CONTRACT_VIOLATION_THROWS_V)
-
-  gsl_api inline void fail_fast_assert(bool cond, char const* const message) {
-#ifdef __CUDA_ARCH__
-    assert(cond);
-#else   // __CUDA_ARCH__
-    if (!cond)
-      throw fail_fast(message);
-#endif  // __CUDA_ARCH__
-  }
-
-#elif gsl_CONFIG(CONTRACT_VIOLATION_CALLS_HANDLER_V)
-
-  // Should be defined by user
-  gsl_api void fail_fast_assert_handler(char const* const expression, char const* const message, char const* const file, int line);
-
-  gsl_api inline void fail_fast_assert(bool cond, char const* const expression, char const* const message, char const* const file, int line) {
-#ifdef __CUDA_ARCH__
-    assert(cond);
-#else   // __CUDA_ARCH__
-    if (!cond)
-      fail_fast_assert_handler(expression, message, file, line);
-#endif  // __CUDA_ARCH__
-  }
-
-#else
-
-  gsl_api inline void fail_fast_assert(bool cond) gsl_noexcept {
-#ifdef __CUDA_ARCH__
-    assert(cond);
-#else   // __CUDA_ARCH__
-    if (!cond)
-      std::terminate();
-#endif  // __CUDA_ARCH__
-  }
-
-#endif
-
-  //
-  // GSL.util: utilities
-  //
-
-#if gsl_FEATURE(EXPERIMENTAL_RETURN_GUARD)
-
-  // Add uncaught_exceptions for pre-2017 MSVC, GCC and Clang
-  // Return unsigned char to save stack space, uncaught_exceptions can only increase by 1 in a scope
-
-  namespace detail {
-
-  inline unsigned char to_uchar(unsigned x) gsl_noexcept {
-    return static_cast<unsigned char>(x);
-  }
-
-  }  // namespace detail
-
-  namespace std11 {
-
-#if gsl_HAVE(UNCAUGHT_EXCEPTIONS)
-
-  inline unsigned char uncaught_exceptions() gsl_noexcept {
-    return detail::to_uchar(std::uncaught_exceptions());
-  }
-
-#elif gsl_COMPILER_MSVC_VERSION
-
-  extern "C" char* __cdecl _getptd();
-  inline unsigned char uncaught_exceptions() gsl_noexcept {
-    return detail::to_uchar(*reinterpret_cast<unsigned*>(_getptd() + (sizeof(void*) == 8 ? 0x100 : 0x90)));
-  }
-
-#elif gsl_COMPILER_CLANG_VERSION || gsl_COMPILER_GNUC_VERSION
-
-  extern "C" char* __cxa_get_globals();
-  inline unsigned char uncaught_exceptions() gsl_noexcept {
-    return detail::to_uchar(*reinterpret_cast<unsigned*>(__cxa_get_globals() + sizeof(void*)));
-  }
-#endif
-  }  // namespace std11
-#endif
-
-#if gsl_CPP11_OR_GREATER || gsl_COMPILER_MSVC_VERSION >= 110
-
-  template <class F>
-  class final_action {
-   public:
-    gsl_api explicit final_action(F action) gsl_noexcept
-        : action_(std::move(action)),
-          invoke_(true) {}
-
-    gsl_api final_action(final_action&& other) gsl_noexcept
-        : action_(std::move(other.action_)),
-          invoke_(other.invoke_) {
-      other.invoke_ = false;
-    }
-
-    gsl_api virtual ~final_action() gsl_noexcept {
-      if (invoke_)
-        action_();
-    }
-
-    gsl_is_delete_access : gsl_api final_action(final_action const&) gsl_is_delete;
-    gsl_api final_action& operator=(final_action const&) gsl_is_delete;
-    gsl_api final_action& operator=(final_action&&) gsl_is_delete;
-
-   protected:
-    gsl_api void dismiss() gsl_noexcept {
-      invoke_ = false;
-    }
-
-   private:
-    F action_;
-    bool invoke_;
-  };
-
-  template <class F>
-  gsl_api inline final_action<F> finally(F const& action) gsl_noexcept {
-    return final_action<F>(action);
-  }
-
-  template <class F>
-  gsl_api inline final_action<F> finally(F && action) gsl_noexcept {
-    return final_action<F>(std::forward<F>(action));
-  }
-
-#if gsl_FEATURE(EXPERIMENTAL_RETURN_GUARD)
-
-  template <class F>
-  class final_action_return : public final_action<F> {
-   public:
-    gsl_api explicit final_action_return(F&& action) gsl_noexcept
-        : final_action<F>(std::move(action)),
-          exception_count(std11::uncaught_exceptions()) {}
-
-    gsl_api final_action_return(final_action_return&& other) gsl_noexcept
-        : final_action<F>(std::move(other)),
-          exception_count(std11::uncaught_exceptions()) {}
-
-    gsl_api ~final_action_return() override {
-      if (std11::uncaught_exceptions() != exception_count)
-        this->dismiss();
-    }
-
-    gsl_is_delete_access : gsl_api final_action_return(final_action_return const&) gsl_is_delete;
-    gsl_api final_action_return& operator=(final_action_return const&) gsl_is_delete;
-
-   private:
-    unsigned char exception_count;
-  };
-
-  template <class F>
-  gsl_api inline final_action_return<F> on_return(F const& action) gsl_noexcept {
-    return final_action_return<F>(action);
-  }
-
-  template <class F>
-  gsl_api inline final_action_return<F> on_return(F && action) gsl_noexcept {
-    return final_action_return<F>(std::forward<F>(action));
-  }
-
-  template <class F>
-  class final_action_error : public final_action<F> {
-   public:
-    gsl_api explicit final_action_error(F&& action) gsl_noexcept
-        : final_action<F>(std::move(action)),
-          exception_count(std11::uncaught_exceptions()) {}
-
-    gsl_api final_action_error(final_action_error&& other) gsl_noexcept
-        : final_action<F>(std::move(other)),
-          exception_count(std11::uncaught_exceptions()) {}
-
-    gsl_api ~final_action_error() override {
-      if (std11::uncaught_exceptions() == exception_count)
-        this->dismiss();
-    }
-
-    gsl_is_delete_access : gsl_api final_action_error(final_action_error const&) gsl_is_delete;
-    gsl_api final_action_error& operator=(final_action_error const&) gsl_is_delete;
-
-   private:
-    unsigned char exception_count;
-  };
-
-  template <class F>
-  gsl_api inline final_action_error<F> on_error(F const& action) gsl_noexcept {
-    return final_action_error<F>(action);
-  }
-
-  template <class F>
-  gsl_api inline final_action_error<F> on_error(F && action) gsl_noexcept {
-    return final_action_error<F>(std::forward<F>(action));
-  }
-
-#endif  // gsl_FEATURE( EXPERIMENTAL_RETURN_GUARD )
-
-#else  // gsl_CPP11_OR_GREATER || gsl_COMPILER_MSVC_VERSION >= 110
-
-  class final_action {
-   public:
-    typedef void (*Action)();
-
-    gsl_api final_action(Action action)
-        : action_(action), invoke_(true) {}
-
-    gsl_api final_action(final_action const& other)
-        : action_(other.action_), invoke_(other.invoke_) {
-      other.invoke_ = false;
-    }
-
-    gsl_api virtual ~final_action() {
-      if (invoke_)
-        action_();
-    }
-
-   protected:
-    gsl_api void dismiss() {
-      invoke_ = false;
-    }
-
-   private:
-    gsl_api final_action& operator=(final_action const&);
-
-   private:
-    Action action_;
-    mutable bool invoke_;
-  };
-
-  template <class F>
-  gsl_api inline final_action finally(F const& f) {
-    return final_action((f));
-  }
-
-#if gsl_FEATURE(EXPERIMENTAL_RETURN_GUARD)
-
-  class final_action_return : public final_action {
-   public:
-    gsl_api explicit final_action_return(Action action)
-        : final_action(action), exception_count(std11::uncaught_exceptions()) {}
-
-    gsl_api ~final_action_return() {
-      if (std11::uncaught_exceptions() != exception_count)
-        this->dismiss();
-    }
-
-   private:
-    gsl_api final_action_return& operator=(final_action_return const&);
-
-   private:
-    unsigned char exception_count;
-  };
-
-  template <class F>
-  gsl_api inline final_action_return on_return(F const& action) {
-    return final_action_return(action);
-  }
-
-  class final_action_error : public final_action {
-   public:
-    gsl_api explicit final_action_error(Action action)
-        : final_action(action), exception_count(std11::uncaught_exceptions()) {}
-
-    gsl_api ~final_action_error() {
-      if (std11::uncaught_exceptions() == exception_count)
-        this->dismiss();
-    }
-
-   private:
-    gsl_api final_action_error& operator=(final_action_error const&);
-
-   private:
-    unsigned char exception_count;
-  };
-
-  template <class F>
-  gsl_api inline final_action_error on_error(F const& action) {
-    return final_action_error(action);
-  }
-
-#endif  // gsl_FEATURE( EXPERIMENTAL_RETURN_GUARD )
-
-#endif  // gsl_CPP11_OR_GREATER || gsl_COMPILER_MSVC_VERSION == 110
-
-#if gsl_CPP11_OR_GREATER || gsl_COMPILER_MSVC_VERSION >= 120
-
-  template <class T, class U>
-  gsl_api inline gsl_constexpr T narrow_cast(U && u) gsl_noexcept {
-    return static_cast<T>(std::forward<U>(u));
-  }
-
-#else
-
-  template <class T, class U>
-  gsl_api inline T narrow_cast(U u) gsl_noexcept {
-    return static_cast<T>(u);
-  }
-
-#endif  // gsl_CPP11_OR_GREATER || gsl_COMPILER_MSVC_VERSION >= 120
-
-  struct narrowing_error : public std::exception {};
-
-#if gsl_HAVE(TYPE_TRAITS)
-
-  namespace detail {
-  template <class T, class U>
-  struct is_same_signedness : public std::integral_constant<bool, std::is_signed<T>::value == std::is_signed<U>::value> {};
-  }  // namespace detail
-#endif
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4127)
-#endif
-  template <class T, class U>
-  gsl_api inline T narrow(U u) {
-    T t = narrow_cast<T>(u);
-
-    if (static_cast<U>(t) != u) {
-#if gsl_CONFIG(CONTRACT_VIOLATION_THROWS_V)
-      throw narrowing_error();
-#else
-      std::terminate();
-#endif
-    }
-
-#if gsl_HAVE(TYPE_TRAITS)
-    if (!detail::is_same_signedness<T, U>::value && ((t < T()) != (u < U())))
-#else
-    // Don't assume T() works:
-    if ((t < 0) != (u < 0))
-#endif
-    {
-#if gsl_CONFIG(CONTRACT_VIOLATION_THROWS_V)
-      throw narrowing_error();
-#else
-      std::terminate();
-#endif
-    }
-    return t;
-  }
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-  //
-  // at() - Bounds-checked way of accessing static arrays, std::array, std::vector.
-  //
-
-  template <class T, size_t N>
-  gsl_api inline gsl_constexpr14 T& at(T(&arr)[N], size_t pos) {
-    Expects(pos < N);
-    return arr[pos];
-  }
-
-  template <class Container>
-  gsl_api inline gsl_constexpr14 typename Container::value_type& at(Container & cont, size_t pos) {
-    Expects(pos < cont.size());
-    return cont[pos];
-  }
-
-  template <class Container>
-  gsl_api inline gsl_constexpr14 typename Container::value_type const& at(Container const& cont, size_t pos) {
-    Expects(pos < cont.size());
-    return cont[pos];
-  }
-
-#if gsl_HAVE(INITIALIZER_LIST)
-
-  template <class T>
-  gsl_api inline const gsl_constexpr14 T at(std::initializer_list<T> cont, size_t pos) {
-    Expects(pos < cont.size());
-    return *(cont.begin() + pos);
-  }
-#endif
-
-  template <class T>
-  gsl_api inline gsl_constexpr T& at(span<T> s, size_t pos) {
-    return s.at(pos);
-  }
-
-  //
-  // GSL.views: views
-  //
-
-  //
-  // not_null<> - Wrap any indirection and enforce non-null.
-  //
-  template <class T>
-  class not_null {
-#if gsl_CONFIG(NOT_NULL_EXPLICIT_CTOR)
-#define gsl_not_null_explicit explicit
-#else
-#define gsl_not_null_explicit /*explicit*/
-#endif
-
-#if gsl_CONFIG(NOT_NULL_GET_BY_CONST_REF)
-    typedef T const& get_result_t;
-#else
-    typedef T get_result_t;
-#endif
-
-   public:
-#if gsl_HAVE(TYPE_TRAITS)
-    static_assert(std::is_assignable<T&, std::nullptr_t>::value, "T cannot be assigned nullptr.");
-#endif
-
-    template <class U
-#if gsl_HAVE(DEFAULT_FUNCTION_TEMPLATE_ARG)
-                  gsl_REQUIRES_T((std::is_constructible<T, U>::value))
-#endif
-              >
-    gsl_api gsl_constexpr14 gsl_not_null_explicit
-#if gsl_HAVE(RVALUE_REFERENCE)
-    not_null(U&& u)
-        : ptr_(std::forward<U>(u))
-#else
-    not_null(U const& u)
-        : ptr_(u)
-#endif
-    {
-      Expects(ptr_ != gsl_nullptr);
-    }
-#undef gsl_not_null_explicit
-
-#if gsl_HAVE(IS_DEFAULT)
-    ~not_null() = default;
-    gsl_constexpr not_null(not_null&& other) = default;
-    gsl_constexpr not_null(not_null const& other) = default;
-    not_null& operator=(not_null&& other) = default;
-    not_null& operator=(not_null const& other) = default;
-#else
-    gsl_api ~not_null(){};
-    gsl_api gsl_constexpr not_null(not_null const& other) : ptr_(other.ptr_) {}
-    gsl_api not_null& operator=(not_null const& other) {
-      ptr_ = other.ptr_;
-      return *this;
-    }
-#if gsl_HAVE(RVALUE_REFERENCE)
-    gsl_api gsl_constexpr not_null(not_null&& other) : ptr_(std::move(other.get())) {}
-    gsl_api not_null& operator=(not_null&& other) {
-      ptr_ = std::move(other.get());
-      return *this;
-    }
-#endif
-#endif
-
-    template <class U
-#if gsl_HAVE(DEFAULT_FUNCTION_TEMPLATE_ARG)
-                  gsl_REQUIRES_T((std::is_convertible<U, T>::value))
-#endif
-              >
-    gsl_api gsl_constexpr not_null(not_null<U> const& other)
-        : ptr_(other.get()) {
-    }
-
-    gsl_api gsl_constexpr14 get_result_t get() const {
-      // Without cheating and changing ptr_ from the outside, this check is superfluous:
-      Ensures(ptr_ != gsl_nullptr);
-      return ptr_;
-    }
-
-    gsl_api gsl_constexpr operator get_result_t() const { return get(); }
-    gsl_api gsl_constexpr get_result_t operator->() const { return get(); }
-
-#if gsl_HAVE(DECLTYPE_AUTO)
-    gsl_api gsl_constexpr decltype(auto) operator*() const { return *get(); }
-#endif
-
-    gsl_is_delete_access :
-    // prevent compilation when initialized with a nullptr or literal 0:
-#if gsl_HAVE(NULLPTR)
-        gsl_api not_null(std::nullptr_t) gsl_is_delete;
-    gsl_api not_null& operator=(std::nullptr_t) gsl_is_delete;
-#else
-        gsl_api
-        not_null(int) gsl_is_delete;
-    gsl_api not_null& operator=(int) gsl_is_delete;
-#endif
-
-    // unwanted operators...pointers only point to single objects!
-    gsl_api not_null& operator++() gsl_is_delete;
-    gsl_api not_null& operator--() gsl_is_delete;
-    gsl_api not_null operator++(int) gsl_is_delete;
-    gsl_api not_null operator--(int) gsl_is_delete;
-    gsl_api not_null& operator+(size_t) gsl_is_delete;
-    gsl_api not_null& operator+=(size_t) gsl_is_delete;
-    gsl_api not_null& operator-(size_t) gsl_is_delete;
-    gsl_api not_null& operator-=(size_t) gsl_is_delete;
-    gsl_api not_null& operator+=(std::ptrdiff_t) gsl_is_delete;
-    gsl_api not_null& operator-=(std::ptrdiff_t) gsl_is_delete;
-    gsl_api void operator[](std::ptrdiff_t) const gsl_is_delete;
-
-   private:
-    T ptr_;
-  };
-
-  // not_null with implicit constructor, allowing copy-initialization:
-
-  template <class T>
-  class not_null_ic : public not_null<T> {
-   public:
-    template <class U
-#if gsl_HAVE(DEFAULT_FUNCTION_TEMPLATE_ARG)
-                  gsl_REQUIRES_T((std::is_constructible<T, U>::value))
-#endif
-              >
-    gsl_api gsl_constexpr14
-#if gsl_HAVE(RVALUE_REFERENCE)
-    not_null_ic(U&& u)
-        : not_null<T>(std::forward<U>(u))
-#else
-    not_null_ic(U const& u)
-        : not_null<T>(u)
-#endif
-    {
-    }
-  };
-
-  // more not_null unwanted operators
-
-  template <class T, class U>
-  std::ptrdiff_t operator-(not_null<T> const&, not_null<U> const&) gsl_is_delete;
-
-  template <class T>
-  not_null<T> operator-(not_null<T> const&, std::ptrdiff_t) gsl_is_delete;
-
-  template <class T>
-  not_null<T> operator+(not_null<T> const&, std::ptrdiff_t) gsl_is_delete;
-
-  template <class T>
-  not_null<T> operator+(std::ptrdiff_t, not_null<T> const&) gsl_is_delete;
-
-  // not_null comparisons
-
-  template <class T, class U>
-  gsl_api inline gsl_constexpr bool operator==(not_null<T> const& l, not_null<U> const& r) {
-    return l.get() == r.get();
-  }
-
-  template <class T, class U>
-  gsl_api inline gsl_constexpr bool operator<(not_null<T> const& l, not_null<U> const& r) {
-    return l.get() < r.get();
-  }
-
-  template <class T, class U>
-  gsl_api inline gsl_constexpr bool operator!=(not_null<T> const& l, not_null<U> const& r) {
-    return !(l == r);
-  }
-
-  template <class T, class U>
-  gsl_api inline gsl_constexpr bool operator<=(not_null<T> const& l, not_null<U> const& r) {
-    return !(r < l);
-  }
-
-  template <class T, class U>
-  gsl_api inline gsl_constexpr bool operator>(not_null<T> const& l, not_null<U> const& r) {
-    return (r < l);
-  }
-
-  template <class T, class U>
-  gsl_api inline gsl_constexpr bool operator>=(not_null<T> const& l, not_null<U> const& r) {
-    return !(l < r);
-  }
-
-//
-// Byte-specific type.
-//
-#if gsl_HAVE(ENUM_CLASS_CONSTRUCTION_FROM_UNDERLYING_TYPE)
-  enum class gsl_may_alias byte : unsigned char {};
-#else
-  struct gsl_may_alias byte {
-    typedef unsigned char type;
-    type v;
-  };
-#endif
-
-#if gsl_HAVE(DEFAULT_FUNCTION_TEMPLATE_ARG)
-#define gsl_ENABLE_IF_INTEGRAL_T(T) \
-  gsl_REQUIRES_T((std::is_integral<T>::value))
-#else
-#define gsl_ENABLE_IF_INTEGRAL_T(T)
-#endif
-
-  template <class T>
-  gsl_api inline gsl_constexpr byte to_byte(T v) gsl_noexcept {
-#if gsl_HAVE(ENUM_CLASS_CONSTRUCTION_FROM_UNDERLYING_TYPE)
-    return static_cast<byte>(v);
-#elif gsl_HAVE(CONSTEXPR_11)
-    return {static_cast<typename byte::type>(v)};
-#else
-    byte b = {static_cast<typename byte::type>(v)};
-    return b;
-#endif
-  }
-
-  template <class IntegerType gsl_ENABLE_IF_INTEGRAL_T(IntegerType)>
-  gsl_api inline gsl_constexpr IntegerType to_integer(byte b) gsl_noexcept {
-#if gsl_HAVE(ENUM_CLASS_CONSTRUCTION_FROM_UNDERLYING_TYPE)
-    return static_cast<typename std::underlying_type<byte>::type>(b);
-#else
-    return b.v;
-#endif
-  }
-
-  gsl_api inline gsl_constexpr unsigned char to_uchar(byte b) gsl_noexcept {
-    return to_integer<unsigned char>(b);
-  }
-
-  gsl_api inline gsl_constexpr unsigned char to_uchar(int i) gsl_noexcept {
-    return static_cast<unsigned char>(i);
-  }
-
-#if !gsl_HAVE(ENUM_CLASS_CONSTRUCTION_FROM_UNDERLYING_TYPE)
-
-  gsl_api inline gsl_constexpr bool operator==(byte l, byte r) gsl_noexcept {
-    return l.v == r.v;
-  }
-
-  gsl_api inline gsl_constexpr bool operator!=(byte l, byte r) gsl_noexcept {
-    return !(l == r);
-  }
-
-  gsl_api inline gsl_constexpr bool operator<(byte l, byte r) gsl_noexcept {
-    return l.v < r.v;
-  }
-
-  gsl_api inline gsl_constexpr bool operator<=(byte l, byte r) gsl_noexcept {
-    return !(r < l);
-  }
-
-  gsl_api inline gsl_constexpr bool operator>(byte l, byte r) gsl_noexcept {
-    return (r < l);
-  }
-
-  gsl_api inline gsl_constexpr bool operator>=(byte l, byte r) gsl_noexcept {
-    return !(l < r);
-  }
-#endif
-
-  template <class IntegerType gsl_ENABLE_IF_INTEGRAL_T(IntegerType)>
-  gsl_api inline gsl_constexpr14 byte& operator<<=(byte& b, IntegerType shift) gsl_noexcept {
-#if gsl_HAVE(ENUM_CLASS_CONSTRUCTION_FROM_UNDERLYING_TYPE)
-    return b = to_byte(to_uchar(b) << shift);
-#else
-    b.v = to_uchar(b.v << shift);
-    return b;
-#endif
-  }
-
-  template <class IntegerType gsl_ENABLE_IF_INTEGRAL_T(IntegerType)>
-  gsl_api inline gsl_constexpr byte operator<<(byte b, IntegerType shift) gsl_noexcept {
-    return to_byte(to_uchar(b) << shift);
-  }
-
-  template <class IntegerType gsl_ENABLE_IF_INTEGRAL_T(IntegerType)>
-  gsl_api inline gsl_constexpr14 byte& operator>>=(byte& b, IntegerType shift) gsl_noexcept {
-#if gsl_HAVE(ENUM_CLASS_CONSTRUCTION_FROM_UNDERLYING_TYPE)
-    return b = to_byte(to_uchar(b) >> shift);
-#else
-    b.v = to_uchar(b.v >> shift);
-    return b;
-#endif
-  }
-
-  template <class IntegerType gsl_ENABLE_IF_INTEGRAL_T(IntegerType)>
-  gsl_api inline gsl_constexpr byte operator>>(byte b, IntegerType shift) gsl_noexcept {
-    return to_byte(to_uchar(b) >> shift);
-  }
-
-  gsl_api inline gsl_constexpr14 byte& operator|=(byte& l, byte r) gsl_noexcept {
-#if gsl_HAVE(ENUM_CLASS_CONSTRUCTION_FROM_UNDERLYING_TYPE)
-    return l = to_byte(to_uchar(l) | to_uchar(r));
-#else
-    l.v = to_uchar(l) | to_uchar(r);
-    return l;
-#endif
-  }
-
-  gsl_api inline gsl_constexpr byte operator|(byte l, byte r) gsl_noexcept {
-    return to_byte(to_uchar(l) | to_uchar(r));
-  }
-
-  gsl_api inline gsl_constexpr14 byte& operator&=(byte& l, byte r) gsl_noexcept {
-#if gsl_HAVE(ENUM_CLASS_CONSTRUCTION_FROM_UNDERLYING_TYPE)
-    return l = to_byte(to_uchar(l) & to_uchar(r));
-#else
-    l.v = to_uchar(l) & to_uchar(r);
-    return l;
-#endif
-  }
-
-  gsl_api inline gsl_constexpr byte operator&(byte l, byte r)gsl_noexcept {
-    return to_byte(to_uchar(l) & to_uchar(r));
-  }
-
-  gsl_api inline gsl_constexpr14 byte& operator^=(byte& l, byte r) gsl_noexcept {
-#if gsl_HAVE(ENUM_CLASS_CONSTRUCTION_FROM_UNDERLYING_TYPE)
-    return l = to_byte(to_uchar(l) ^ to_uchar(r));
-#else
-    l.v = to_uchar(l) ^ to_uchar(r);
-    return l;
-#endif
-  }
-
-  gsl_api inline gsl_constexpr byte operator^(byte l, byte r) gsl_noexcept {
-    return to_byte(to_uchar(l) ^ to_uchar(r));
-  }
-
-  gsl_api inline gsl_constexpr byte operator~(byte b) gsl_noexcept {
-    return to_byte(~to_uchar(b));
-  }
-
-#if gsl_FEATURE_TO_STD(WITH_CONTAINER)
-
-  // Tag to select span constructor taking a container (prevent ms-gsl warning C26426):
-
-  struct with_container_t {
-    gsl_constexpr with_container_t() gsl_noexcept {}
-  };
-  const gsl_constexpr with_container_t with_container;
-
-#endif
-
-  //
-  // span<> - A 1D view of contiguous T's, replace (*,len).
-  //
-  template <class T>
-  class span {
-    template <class U>
-    friend class span;
-
-   public:
-    typedef index index_type;
-
-    typedef T element_type;
-    typedef typename std11::remove_cv<T>::type value_type;
-
-    typedef T& reference;
-    typedef T* pointer;
-    typedef T const* const_pointer;
-    typedef T const& const_reference;
-
-    typedef pointer iterator;
-    typedef const_pointer const_iterator;
-
-    typedef std::reverse_iterator<iterator> reverse_iterator;
-    typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
-
-    typedef typename std::iterator_traits<iterator>::difference_type difference_type;
-
-    // 26.7.3.2 Constructors, copy, and assignment [span.cons]
-
-    gsl_api gsl_constexpr14 span() gsl_noexcept
-        : first_(gsl_nullptr),
-          last_(gsl_nullptr) {
-      Expects(size() == 0);
-    }
-
-#if !gsl_DEPRECATE_TO_LEVEL(5)
-
-#if gsl_HAVE(NULLPTR)
-    gsl_api gsl_constexpr14 span(std::nullptr_t, index_type size_in)
-        : first_(nullptr), last_(nullptr) {
-      Expects(size_in == 0);
-    }
-#endif
-
-#if gsl_HAVE(IS_DELETE)
-    gsl_api gsl_constexpr span(reference data_in)
-        : span(&data_in, 1) {}
-
-    gsl_api gsl_constexpr span(element_type&&) = delete;
-#endif
-
-#endif  // deprecate
-
-    gsl_api gsl_constexpr14 span(pointer data_in, index_type size_in)
-        : first_(data_in), last_(data_in + size_in) {
-      Expects(size_in == 0 || (size_in > 0 && data_in != gsl_nullptr));
-    }
-
-    gsl_api gsl_constexpr14 span(pointer first_in, pointer last_in)
-        : first_(first_in), last_(last_in) {
-      Expects(first_in <= last_in);
-    }
-
-#if !gsl_DEPRECATE_TO_LEVEL(5)
-
-    template <class U>
-    gsl_api gsl_constexpr14 span(U*& data_in, index_type size_in)
-        : first_(data_in), last_(data_in + size_in) {
-      Expects(size_in == 0 || (size_in > 0 && data_in != gsl_nullptr));
-    }
-
-    template <class U>
-    gsl_api gsl_constexpr14 span(U* const& data_in, index_type size_in)
-        : first_(data_in), last_(data_in + size_in) {
-      Expects(size_in == 0 || (size_in > 0 && data_in != gsl_nullptr));
-    }
-
-#endif  // deprecate
-
-#if !gsl_DEPRECATE_TO_LEVEL(5)
-    template <class U, size_t N>
-    gsl_api gsl_constexpr span(U (&arr)[N]) gsl_noexcept
-        : first_(gsl_ADDRESSOF(arr[0])),
-          last_(gsl_ADDRESSOF(arr[0]) + N) {}
-#else
-    template <size_t N
-#if gsl_HAVE(DEFAULT_FUNCTION_TEMPLATE_ARG)
-                  gsl_REQUIRES_T((std::is_convertible<value_type (*)[], element_type (*)[]>::value))
-#endif
-              >
-    gsl_api gsl_constexpr span(element_type (&arr)[N]) gsl_noexcept
-        : first_(gsl_ADDRESSOF(arr[0])),
-          last_(gsl_ADDRESSOF(arr[0]) + N) {
-    }
-#endif  // deprecate
-
-#if gsl_HAVE(ARRAY)
-#if !gsl_DEPRECATE_TO_LEVEL(5)
-
-    template <class U, size_t N>
-    gsl_api gsl_constexpr span(std::array<U, N>& arr)
-        : first_(arr.data()), last_(arr.data() + N) {}
-
-    template <class U, size_t N>
-    gsl_api gsl_constexpr span(std::array<U, N> const& arr)
-        : first_(arr.data()), last_(arr.data() + N) {}
-
-#else
-
-    template <size_t N
-#if gsl_HAVE(DEFAULT_FUNCTION_TEMPLATE_ARG)
-                  gsl_REQUIRES_T((std::is_convertible<value_type (*)[], element_type (*)[]>::value))
-#endif
-              >
-    gsl_api gsl_constexpr span(std::array<value_type, N>& arr)
-        : first_(arr.data()), last_(arr.data() + N) {
-    }
-
-    template <size_t N
-#if gsl_HAVE(DEFAULT_FUNCTION_TEMPLATE_ARG)
-                  gsl_REQUIRES_T((std::is_convertible<value_type (*)[], element_type (*)[]>::value))
-#endif
-              >
-    gsl_api gsl_constexpr span(std::array<value_type, N> const& arr)
-        : first_(arr.data()), last_(arr.data() + N) {
-    }
-
-#endif  // deprecate
-#endif  // gsl_HAVE( ARRAY )
-
-#if gsl_HAVE(CONSTRAINED_SPAN_CONTAINER_CTOR)
-    template <class Container
-                  gsl_REQUIRES_T((detail::is_compatible_container<Container, element_type>::value))>
-    gsl_api gsl_constexpr span(Container& cont)
-        : first_(std17::data(cont)), last_(std17::data(cont) + std17::size(cont)) {}
-
-    template <class Container
-                  gsl_REQUIRES_T((
-                      std::is_const<element_type>::value && detail::is_compatible_container<Container, element_type>::value))>
-    gsl_api gsl_constexpr span(Container const& cont)
-        : first_(std17::data(cont)), last_(std17::data(cont) + std17::size(cont)) {}
-
-#elif gsl_HAVE(UNCONSTRAINED_SPAN_CONTAINER_CTOR)
-
-    template <class Container>
-    gsl_api gsl_constexpr span(Container& cont)
-        : first_(cont.size() == 0 ? gsl_nullptr : gsl_ADDRESSOF(cont[0])), last_(cont.size() == 0 ? gsl_nullptr : gsl_ADDRESSOF(cont[0]) + cont.size()) {}
-
-    template <class Container>
-    gsl_api gsl_constexpr span(Container const& cont)
-        : first_(cont.size() == 0 ? gsl_nullptr : gsl_ADDRESSOF(cont[0])), last_(cont.size() == 0 ? gsl_nullptr : gsl_ADDRESSOF(cont[0]) + cont.size()) {}
-
-#endif
-
-#if gsl_FEATURE_TO_STD(WITH_CONTAINER)
-
-    template <class Container>
-    gsl_api gsl_constexpr span(with_container_t, Container& cont)
-        : first_(cont.size() == 0 ? gsl_nullptr : gsl_ADDRESSOF(cont[0])), last_(cont.size() == 0 ? gsl_nullptr : gsl_ADDRESSOF(cont[0]) + cont.size()) {}
-
-    template <class Container>
-    gsl_api gsl_constexpr span(with_container_t, Container const& cont)
-        : first_(cont.size() == 0 ? gsl_nullptr : gsl_ADDRESSOF(cont[0])), last_(cont.size() == 0 ? gsl_nullptr : gsl_ADDRESSOF(cont[0]) + cont.size()) {}
-
-#endif
-
-#if !gsl_DEPRECATE_TO_LEVEL(4)
-    // constructor taking shared_ptr deprecated since 0.29.0
-
-#if gsl_HAVE(SHARED_PTR)
-    gsl_api gsl_constexpr span(shared_ptr<element_type> const& ptr)
-        : first_(ptr.get()), last_(ptr.get() ? ptr.get() + 1 : gsl_nullptr) {}
-#endif
-
-    // constructors taking unique_ptr deprecated since 0.29.0
-
-#if gsl_HAVE(UNIQUE_PTR)
-#if gsl_HAVE(DEFAULT_FUNCTION_TEMPLATE_ARG)
-    template <class ArrayElementType = typename std::add_pointer<element_type>::type>
-#else
-    template <class ArrayElementType>
-#endif
-    gsl_api gsl_constexpr span(unique_ptr<ArrayElementType> const& ptr, index_type count)
-        : first_(ptr.get()), last_(ptr.get() + count) {
-    }
-
-    gsl_api gsl_constexpr span(unique_ptr<element_type> const& ptr)
-        : first_(ptr.get()), last_(ptr.get() ? ptr.get() + 1 : gsl_nullptr) {}
-#endif
-
-#endif  // deprecate shared_ptr, unique_ptr
-
-#if gsl_HAVE(IS_DEFAULT) && !gsl_BETWEEN(gsl_COMPILER_GNUC_VERSION, 430, 600)
-    gsl_constexpr span(span&&) gsl_noexcept = default;
-    gsl_constexpr span(span const&) = default;
-#else
-    gsl_api gsl_constexpr span(span const& other)
-        : first_(other.begin()), last_(other.end()) {}
-#endif
-
-#if gsl_HAVE(IS_DEFAULT)
-    ~span() = default;
-#else
-    ~span() {}
-#endif
-
-#if gsl_HAVE(IS_DEFAULT)
-    gsl_constexpr14 span& operator=(span&&) gsl_noexcept = default;
-    gsl_constexpr14 span& operator=(span const&) gsl_noexcept = default;
-#else
-    gsl_api span& operator=(span other) gsl_noexcept {
-      other.swap(*this);
-      return *this;
-    }
-#endif
-
-    template <class U
-#if gsl_HAVE(DEFAULT_FUNCTION_TEMPLATE_ARG)
-                  gsl_REQUIRES_T((std::is_convertible<U (*)[], element_type (*)[]>::value))
-#endif
-              >
-    gsl_api gsl_constexpr span(span<U> const& other)
-        : first_(other.begin()), last_(other.end()) {
-    }
-
-#if 0
-    // Converting from other span ?
-    template< class U > operator=();
-#endif
-
-    // 26.7.3.3 Subviews [span.sub]
-
-    gsl_api gsl_constexpr14 span first(index_type count) const gsl_noexcept {
-      Expects(count <= this->size());
-      return span(this->data(), count);
-    }
-
-    gsl_api gsl_constexpr14 span last(index_type count) const gsl_noexcept {
-      Expects(count <= this->size());
-      return span(this->data() + this->size() - count, count);
-    }
-
-    gsl_api gsl_constexpr14 span subspan(index_type offset) const gsl_noexcept {
-      Expects(offset <= this->size());
-      return span(this->data() + offset, this->size() - offset);
-    }
-
-    gsl_api gsl_constexpr14 span subspan(index_type offset, index_type count) const gsl_noexcept {
-      Expects(
-          offset <= this->size() &&
-          count <= this->size() - offset);
-      return span(this->data() + offset, count);
-    }
-
-    // 26.7.3.4 Observers [span.obs]
-
-    gsl_api gsl_constexpr index_type size() const gsl_noexcept {
-      return narrow_cast<index_type>(last_ - first_);
-    }
-
-    gsl_api gsl_constexpr std::ptrdiff_t ssize() const gsl_noexcept {
-      return narrow_cast<std::ptrdiff_t>(last_ - first_);
-    }
-
-    gsl_api gsl_constexpr index_type size_bytes() const gsl_noexcept {
-      return size() * narrow_cast<index_type>(sizeof(element_type));
-    }
-
-    gsl_api gsl_constexpr bool empty() const gsl_noexcept {
-      return size() == 0;
-    }
-
-    // 26.7.3.5 Element access [span.elem]
-
-    gsl_api gsl_constexpr reference operator[](index_type pos) const {
-      return at(pos);
-    }
-
-    gsl_api gsl_constexpr reference operator()(index_type pos) const {
-      return at(pos);
-    }
-
-    gsl_api gsl_constexpr14 reference at(index_type pos) const {
-      Expects(pos < size());
-      return first_[pos];
-    }
-
-    gsl_api gsl_constexpr pointer data() const gsl_noexcept {
-      return first_;
-    }
-
-    gsl_api gsl_constexpr reference back() const {
-      Expects(size()>0);
-      return last_[-1];
-    }
-
-    // 26.7.3.6 Iterator support [span.iterators]
-
-    gsl_api gsl_constexpr iterator begin() const gsl_noexcept {
-      return iterator(first_);
-    }
-
-    gsl_api gsl_constexpr iterator end() const gsl_noexcept {
-      return iterator(last_);
-    }
-
-    gsl_api gsl_constexpr const_iterator cbegin() const gsl_noexcept {
-#if gsl_CPP11_OR_GREATER
-      return {begin()};
-#else
-      return const_iterator(begin());
-#endif
-    }
-
-    gsl_api gsl_constexpr const_iterator cend() const gsl_noexcept {
-#if gsl_CPP11_OR_GREATER
-      return {end()};
-#else
-      return const_iterator(end());
-#endif
-    }
-
-    gsl_api gsl_constexpr reverse_iterator rbegin() const gsl_noexcept {
-      return reverse_iterator(end());
-    }
-
-    gsl_api gsl_constexpr reverse_iterator rend() const gsl_noexcept {
-      return reverse_iterator(begin());
-    }
-
-    gsl_api gsl_constexpr const_reverse_iterator crbegin() const gsl_noexcept {
-      return const_reverse_iterator(cend());
-    }
-
-    gsl_api gsl_constexpr const_reverse_iterator crend() const gsl_noexcept {
-      return const_reverse_iterator(cbegin());
-    }
-
-    gsl_api void swap(span& other) gsl_noexcept {
-      using std::swap;
-      swap(first_, other.first_);
-      swap(last_, other.last_);
-    }
-
-#if !gsl_DEPRECATE_TO_LEVEL(3)
-    // member length() deprecated since 0.29.0
-
-    gsl_api gsl_constexpr index_type length() const gsl_noexcept {
-      return size();
-    }
-
-    // member length_bytes() deprecated since 0.29.0
-
-    gsl_api gsl_constexpr index_type length_bytes() const gsl_noexcept {
-      return size_bytes();
-    }
-#endif
-
-#if !gsl_DEPRECATE_TO_LEVEL(2)
-    // member as_bytes(), as_writeable_bytes deprecated since 0.17.0
-
-    gsl_api span<const byte> as_bytes() const gsl_noexcept {
-      return span<const byte>(reinterpret_cast<const byte*>(data()), size_bytes());  // NOLINT
-    }
-
-    gsl_api span<byte> as_writeable_bytes() const gsl_noexcept {
-      return span<byte>(reinterpret_cast<byte*>(data()), size_bytes());  // NOLINT
-    }
-
-#endif
-
-    template <class U>
-    gsl_api span<U> as_span() const gsl_noexcept {
-      Expects((this->size_bytes() % sizeof(U)) == 0);
-      return span<U>(reinterpret_cast<U*>(this->data()), this->size_bytes() / sizeof(U));  // NOLINT
-    }
-
-   private:
-    pointer first_;
-    pointer last_;
-  };
-
-  // class template argument deduction guides:
-
-#if gsl_HAVE(DEDUCTION_GUIDES)  // gsl_CPP17_OR_GREATER
-
-  template <class T, size_t N>
-  span(T(&)[N])->span<T /*, N*/>;
-
-  template <class T, size_t N>
-  span(std::array<T, N>&)->span<T /*, N*/>;
-
-  template <class T, size_t N>
-  span(std::array<T, N> const&)->span<const T /*, N*/>;
-
-  template <class Container>
-  span(Container&)->span<typename Container::value_type>;
-
-  template <class Container>
-  span(Container const&)->span<const typename Container::value_type>;
-
-#endif  // gsl_HAVE( DEDUCTION_GUIDES )
-
-  // 26.7.3.7 Comparison operators [span.comparison]
-
-#if gsl_CONFIG(ALLOWS_NONSTRICT_SPAN_COMPARISON)
-
-  template <class T, class U>
-  gsl_api inline gsl_constexpr bool operator==(span<T> const& l, span<U> const& r) {
-    return l.size() == r.size() && (l.begin() == r.begin() || std::equal(l.begin(), l.end(), r.begin()));
-  }
-
-  template <class T, class U>
-  gsl_api inline gsl_constexpr bool operator<(span<T> const& l, span<U> const& r) {
-    return std::lexicographical_compare(l.begin(), l.end(), r.begin(), r.end());
-  }
-
-#else
-
-  template <class T>
-  gsl_api inline gsl_constexpr bool operator==(span<T> const& l, span<T> const& r) {
-    return l.size() == r.size() && (l.begin() == r.begin() || std::equal(l.begin(), l.end(), r.begin()));
-  }
-
-  template <class T>
-  gsl_api inline gsl_constexpr bool operator<(span<T> const& l, span<T> const& r) {
-    return std::lexicographical_compare(l.begin(), l.end(), r.begin(), r.end());
-  }
-#endif
-
-  template <class T, class U>
-  gsl_api inline gsl_constexpr bool operator!=(span<T> const& l, span<U> const& r) {
-    return !(l == r);
-  }
-
-  template <class T, class U>
-  gsl_api inline gsl_constexpr bool operator<=(span<T> const& l, span<U> const& r) {
-    return !(r < l);
-  }
-
-  template <class T, class U>
-  gsl_api inline gsl_constexpr bool operator>(span<T> const& l, span<U> const& r) {
-    return (r < l);
-  }
-
-  template <class T, class U>
-  gsl_api inline gsl_constexpr bool operator>=(span<T> const& l, span<U> const& r) {
-    return !(l < r);
-  }
-
-  // span algorithms
-
-  template <class T>
-  gsl_api inline gsl_constexpr std::size_t size(span<T> const& spn) {
-    return static_cast<std::size_t>(spn.size());
-  }
-
-  template <class T>
-  gsl_api inline gsl_constexpr std::ptrdiff_t ssize(span<T> const& spn) {
-    return spn.ssize();
-  }
-
-  namespace detail {
-
-  template <class II, class N, class OI>
-  gsl_api inline OI copy_n(II first, N count, OI result) {
-    if (count > 0) {
-      *result++ = *first;
-      for (N i = 1; i < count; ++i) {
-        *result++ = *++first;
-      }
-    }
-    return result;
-  }
-  }  // namespace detail
-
-  template <class T, class U>
-  gsl_api inline void copy(span<T> src, span<U> dest) {
-#if gsl_CPP14_OR_GREATER  // gsl_HAVE( TYPE_TRAITS ) (circumvent Travis clang 3.4)
-    static_assert(std::is_assignable<U&, T const&>::value, "Cannot assign elements of source span to elements of destination span");
-#endif
-    Expects(dest.size() >= src.size());
-    detail::copy_n(src.data(), src.size(), dest.data());
-  }
-
-  // span creator functions (see ctors)
-
-  template <class T>
-  gsl_api inline span<const byte> as_bytes(span<T> spn) gsl_noexcept {
-    return span<const byte>(reinterpret_cast<const byte*>(spn.data()), spn.size_bytes());  // NOLINT
-  }
-
-  template <class T>
-  gsl_api inline span<byte> as_writeable_bytes(span<T> spn) gsl_noexcept {
-    return span<byte>(reinterpret_cast<byte*>(spn.data()), spn.size_bytes());  // NOLINT
-  }
-
-#if gsl_FEATURE_TO_STD(MAKE_SPAN)
-
-  template <class T>
-  gsl_api inline gsl_constexpr span<T>
-  make_span(T * ptr, typename span<T>::index_type count) {
-    return span<T>(ptr, count);
-  }
-
-  template <class T>
-  gsl_api inline gsl_constexpr span<T>
-  make_span(T * first, T * last) {
-    return span<T>(first, last);
-  }
-
-  template <class T, size_t N>
-  gsl_api inline gsl_constexpr span<T>
-  make_span(T(&arr)[N]) {
-    return span<T>(gsl_ADDRESSOF(arr[0]), N);
-  }
-
-#if gsl_HAVE(ARRAY)
-
-  template <class T, size_t N>
-  gsl_api inline gsl_constexpr span<T>
-  make_span(std::array<T, N> & arr) {
-    return span<T>(arr);
-  }
-
-  template <class T, size_t N>
-  gsl_api inline gsl_constexpr span<const T>
-  make_span(std::array<T, N> const& arr) {
-    return span<const T>(arr);
-  }
-#endif
-
-#if gsl_HAVE(CONSTRAINED_SPAN_CONTAINER_CTOR) && gsl_HAVE(AUTO)
-
-  template <class Container, class EP = decltype(std17::data(std::declval<Container&>()))>
-  gsl_api inline gsl_constexpr auto
-  make_span(Container & cont)
-      ->span<typename std::remove_pointer<EP>::type> {
-    return span<typename std::remove_pointer<EP>::type>(cont);
-  }
-
-  template <class Container, class EP = decltype(std17::data(std::declval<Container&>()))>
-  gsl_api inline gsl_constexpr auto
-  make_span(Container const& cont)
-      ->span<const typename std::remove_pointer<EP>::type> {
-    return span<const typename std::remove_pointer<EP>::type>(cont);
-  }
-
-#else
-
-  template <class T>
-  gsl_api inline span<T>
-  make_span(std::vector<T> & cont) {
-    return span<T>(with_container, cont);
-  }
-
-  template <class T>
-  gsl_api inline span<const T>
-  make_span(std::vector<T> const& cont) {
-    return span<const T>(with_container, cont);
-  }
-#endif
-
-#if gsl_FEATURE_TO_STD(WITH_CONTAINER)
-
-  template <class Container>
-  gsl_api inline gsl_constexpr span<typename Container::value_type>
-  make_span(with_container_t, Container & cont) gsl_noexcept {
-    return span<typename Container::value_type>(with_container, cont);
-  }
-
-  template <class Container>
-  gsl_api inline gsl_constexpr span<const typename Container::value_type>
-  make_span(with_container_t, Container const& cont) gsl_noexcept {
-    return span<const typename Container::value_type>(with_container, cont);
-  }
-
-#endif  // gsl_FEATURE_TO_STD( WITH_CONTAINER )
-
-  template <class Ptr>
-  gsl_api inline span<typename Ptr::element_type>
-  make_span(Ptr & ptr) {
-    return span<typename Ptr::element_type>(ptr);
-  }
-
-  template <class Ptr>
-  gsl_api inline span<typename Ptr::element_type>
-  make_span(Ptr & ptr, typename span<typename Ptr::element_type>::index_type count) {
-    return span<typename Ptr::element_type>(ptr, count);
-  }
-
-#endif  // gsl_FEATURE_TO_STD( MAKE_SPAN )
-
-#if gsl_FEATURE_TO_STD(BYTE_SPAN)
-
-  template <class T>
-  gsl_api inline gsl_constexpr span<byte>
-  byte_span(T & t) gsl_noexcept {
-    return span<byte>(reinterpret_cast<byte*>(&t), sizeof(T));
-  }
-
-  template <class T>
-  gsl_api inline gsl_constexpr span<const byte>
-  byte_span(T const& t) gsl_noexcept {
-    return span<const byte>(reinterpret_cast<byte const*>(&t), sizeof(T));
-  }
-
-#endif  // gsl_FEATURE_TO_STD( BYTE_SPAN )
-
-  //
-  // basic_string_span:
-  //
-
-  template <class T>
-  class basic_string_span;
-
-  namespace detail {
-
-  template <class T>
-  struct is_basic_string_span_oracle : std11::false_type {};
-
-  template <class T>
-  struct is_basic_string_span_oracle<basic_string_span<T> > : std11::true_type {};
-
-  template <class T>
-  struct is_basic_string_span : is_basic_string_span_oracle<typename std11::remove_cv<T>::type> {};
-
-  template <class T>
-  gsl_api inline gsl_constexpr14 std::size_t string_length(T* ptr, std::size_t max) {
-    if (ptr == gsl_nullptr || max <= 0)
-      return 0;
-
-    std::size_t len = 0;
-    while (len < max && ptr[len])  // NOLINT
-      ++len;
-
-    return len;
-  }
-
-  }  // namespace detail
-
-  //
-  // basic_string_span<> - A view of contiguous characters, replace (*,len).
-  //
-  template <class T>
-  class basic_string_span {
-   public:
-    typedef T element_type;
-    typedef span<T> span_type;
-
-    typedef typename span_type::index_type index_type;
-    typedef typename span_type::difference_type difference_type;
-
-    typedef typename span_type::pointer pointer;
-    typedef typename span_type::reference reference;
-
-    typedef typename span_type::iterator iterator;
-    typedef typename span_type::const_iterator const_iterator;
-    typedef typename span_type::reverse_iterator reverse_iterator;
-    typedef typename span_type::const_reverse_iterator const_reverse_iterator;
-
-    // construction:
-
-#if gsl_HAVE(IS_DEFAULT)
-    gsl_constexpr basic_string_span() gsl_noexcept = default;
-#else
-    gsl_api gsl_constexpr basic_string_span() gsl_noexcept {}
-#endif
-
-#if gsl_HAVE(NULLPTR)
-    gsl_api gsl_constexpr basic_string_span(std::nullptr_t ptr) gsl_noexcept
-        : span_(ptr, index_type(0)) {}
-#endif
-
-    gsl_api gsl_constexpr basic_string_span(pointer ptr)
-        : span_(remove_z(ptr, (std::numeric_limits<index_type>::max)())) {}
-
-    gsl_api gsl_constexpr basic_string_span(pointer ptr, index_type count)
-        : span_(ptr, count) {}
-
-    gsl_api gsl_constexpr basic_string_span(pointer firstElem, pointer lastElem)
-        : span_(firstElem, lastElem) {}
-
-    template <std::size_t N>
-    gsl_api gsl_constexpr basic_string_span(element_type (&arr)[N])
-        : span_(remove_z(gsl_ADDRESSOF(arr[0]), N)) {}
-
-#if gsl_HAVE(ARRAY)
-
-    template <std::size_t N>
-    gsl_api gsl_constexpr basic_string_span(std::array<typename std11::remove_const<element_type>::type, N>& arr)
-        : span_(remove_z(arr)) {}
-
-    template <std::size_t N>
-    gsl_api gsl_constexpr basic_string_span(std::array<typename std11::remove_const<element_type>::type, N> const& arr)
-        : span_(remove_z(arr)) {}
-
-#endif
-
-#if gsl_HAVE(CONSTRAINED_SPAN_CONTAINER_CTOR)
-
-    // Exclude: array, [basic_string,] basic_string_span
-
-    template <class Container
-                  gsl_REQUIRES_T((
-                      !detail::is_std_array<Container>::value && !detail::is_basic_string_span<Container>::value && std::is_convertible<typename Container::pointer, pointer>::value && std::is_convertible<typename Container::pointer, decltype(std::declval<Container>().data())>::value))>
-    gsl_api gsl_constexpr basic_string_span(Container& cont)
-        : span_((cont)) {}
-
-    // Exclude: array, [basic_string,] basic_string_span
-
-    template <class Container
-                  gsl_REQUIRES_T((
-                      !detail::is_std_array<Container>::value && !detail::is_basic_string_span<Container>::value && std::is_convertible<typename Container::pointer, pointer>::value && std::is_convertible<typename Container::pointer, decltype(std::declval<Container const&>().data())>::value))>
-    gsl_api gsl_constexpr basic_string_span(Container const& cont)
-        : span_((cont)) {}
-
-#elif gsl_HAVE(UNCONSTRAINED_SPAN_CONTAINER_CTOR)
-
-    template <class Container>
-    gsl_api gsl_constexpr basic_string_span(Container& cont)
-        : span_(cont) {}
-
-    template <class Container>
-    gsl_api gsl_constexpr basic_string_span(Container const& cont)
-        : span_(cont) {}
-
-#else
-
-    template <class U>
-    gsl_api gsl_constexpr basic_string_span(span<U> const& rhs)
-        : span_(rhs) {}
-
-#endif
-
-#if gsl_FEATURE_TO_STD(WITH_CONTAINER)
-
-    template <class Container>
-    gsl_api gsl_constexpr basic_string_span(with_container_t, Container& cont)
-        : span_(with_container, cont) {}
-#endif
-
-#if gsl_HAVE(IS_DEFAULT)
-#if gsl_BETWEEN(gsl_COMPILER_GNUC_VERSION, 440, 600)
-    gsl_constexpr basic_string_span(basic_string_span const& rhs) = default;
-
-    gsl_constexpr basic_string_span(basic_string_span&& rhs) = default;
-#else
-    gsl_constexpr basic_string_span(basic_string_span const& rhs) gsl_noexcept = default;
-
-    gsl_constexpr basic_string_span(basic_string_span&& rhs) gsl_noexcept = default;
-#endif
-#endif
-
-    template <class U
-#if gsl_HAVE(DEFAULT_FUNCTION_TEMPLATE_ARG)
-                  gsl_REQUIRES_T((std::is_convertible<typename basic_string_span<U>::pointer, pointer>::value))
-#endif
-              >
-    gsl_api gsl_constexpr basic_string_span(basic_string_span<U> const& rhs)
-        : span_(reinterpret_cast<pointer>(rhs.data()), rhs.length())  // NOLINT
-    {
-    }
-
-#if gsl_CPP11_OR_GREATER || gsl_COMPILER_MSVC_VERSION >= 120
-    template <class U
-                  gsl_REQUIRES_T((std::is_convertible<typename basic_string_span<U>::pointer, pointer>::value))>
-    gsl_api gsl_constexpr basic_string_span(basic_string_span<U>&& rhs)
-        : span_(reinterpret_cast<pointer>(rhs.data()), rhs.length())  // NOLINT
-    {}
-#endif
-
-    template <class CharTraits, class Allocator>
-    gsl_api gsl_constexpr basic_string_span(
-        std::basic_string<typename std11::remove_const<element_type>::type, CharTraits, Allocator>& str)
-        : span_(gsl_ADDRESSOF(str[0]), str.length()) {}
-
-    template <class CharTraits, class Allocator>
-    gsl_api gsl_constexpr basic_string_span(
-        std::basic_string<typename std11::remove_const<element_type>::type, CharTraits, Allocator> const& str)
-        : span_(gsl_ADDRESSOF(str[0]), str.length()) {}
-
-    // destruction, assignment:
-
-#if gsl_HAVE(IS_DEFAULT)
-    ~basic_string_span() gsl_noexcept = default;
-
-    basic_string_span& operator=(basic_string_span const& rhs) gsl_noexcept = default;
-
-    basic_string_span& operator=(basic_string_span&& rhs) gsl_noexcept = default;
-#endif
-
-    // sub span:
-
-    gsl_api gsl_constexpr basic_string_span first(index_type count) const {
-      return span_.first(count);
-    }
-
-    gsl_api gsl_constexpr basic_string_span last(index_type count) const {
-      return span_.last(count);
-    }
-
-    gsl_api gsl_constexpr basic_string_span subspan(index_type offset) const {
-      return span_.subspan(offset);
-    }
-
-    gsl_api gsl_constexpr basic_string_span subspan(index_type offset, index_type count) const {
-      return span_.subspan(offset, count);
-    }
-
-    // observers:
-
-    gsl_api gsl_constexpr index_type length() const gsl_noexcept {
-      return span_.size();
-    }
-
-    gsl_api gsl_constexpr index_type size() const gsl_noexcept {
-      return span_.size();
-    }
-
-    gsl_api gsl_constexpr index_type length_bytes() const gsl_noexcept {
-      return span_.size_bytes();
-    }
-
-    gsl_api gsl_constexpr index_type size_bytes() const gsl_noexcept {
-      return span_.size_bytes();
-    }
-
-    gsl_api gsl_constexpr bool empty() const gsl_noexcept {
-      return size() == 0;
-    }
-
-    gsl_api gsl_constexpr reference operator[](index_type idx) const {
-      return span_[idx];
-    }
-
-    gsl_api gsl_constexpr reference operator()(index_type idx) const {
-      return span_[idx];
-    }
-
-    gsl_api gsl_constexpr pointer data() const gsl_noexcept {
-      return span_.data();
-    }
-
-    gsl_api iterator begin() const gsl_noexcept {
-      return span_.begin();
-    }
-
-    gsl_api iterator end() const gsl_noexcept {
-      return span_.end();
-    }
-
-    gsl_api reverse_iterator rbegin() const gsl_noexcept {
-      return span_.rbegin();
-    }
-
-    gsl_api reverse_iterator rend() const gsl_noexcept {
-      return span_.rend();
-    }
-
-    // const version not in p0123r2:
-
-    gsl_api const_iterator cbegin() const gsl_noexcept {
-      return span_.cbegin();
-    }
-
-    gsl_api const_iterator cend() const gsl_noexcept {
-      return span_.cend();
-    }
-
-    gsl_api const_reverse_iterator crbegin() const gsl_noexcept {
-      return span_.crbegin();
-    }
-
-    gsl_api const_reverse_iterator crend() const gsl_noexcept {
-      return span_.crend();
-    }
-
-   private:
-    gsl_api static gsl_constexpr14 span_type remove_z(pointer const& sz, std::size_t max) {
-      return span_type(sz, detail::string_length(sz, max));
-    }
-
-#if gsl_HAVE(ARRAY)
-    template <size_t N>
-    gsl_api static gsl_constexpr14 span_type remove_z(std::array<typename std11::remove_const<element_type>::type, N>& arr) {
-      return remove_z(gsl_ADDRESSOF(arr[0]), narrow_cast<std::size_t>(N));
-    }
-
-    template <size_t N>
-    gsl_api static gsl_constexpr14 span_type remove_z(std::array<typename std11::remove_const<element_type>::type, N> const& arr) {
-      return remove_z(gsl_ADDRESSOF(arr[0]), narrow_cast<std::size_t>(N));
-    }
-#endif
-
-   private:
-    span_type span_;
-  };
-
-  // basic_string_span comparison functions:
-
-#if gsl_CONFIG(ALLOWS_NONSTRICT_SPAN_COMPARISON)
-
-  template <class T, class U>
-  gsl_api inline gsl_constexpr14 bool operator==(basic_string_span<T> const& l, U const& u) gsl_noexcept {
-    const basic_string_span<typename std11::add_const<T>::type> r(u);
-
-    return l.size() == r.size() && std::equal(l.begin(), l.end(), r.begin());
-  }
-
-  template <class T, class U>
-  gsl_api inline gsl_constexpr14 bool operator<(basic_string_span<T> const& l, U const& u) gsl_noexcept {
-    const basic_string_span<typename std11::add_const<T>::type> r(u);
-
-    return std::lexicographical_compare(l.begin(), l.end(), r.begin(), r.end());
-  }
-
-#if gsl_HAVE(DEFAULT_FUNCTION_TEMPLATE_ARG)
-
-  template <class T, class U
-                         gsl_REQUIRES_T((!detail::is_basic_string_span<U>::value))>
-  gsl_api inline gsl_constexpr14 bool operator==(U const& u, basic_string_span<T> const& r) gsl_noexcept {
-    const basic_string_span<typename std11::add_const<T>::type> l(u);
-
-    return l.size() == r.size() && std::equal(l.begin(), l.end(), r.begin());
-  }
-
-  template <class T, class U
-                         gsl_REQUIRES_T((!detail::is_basic_string_span<U>::value))>
-  gsl_api inline gsl_constexpr14 bool operator<(U const& u, basic_string_span<T> const& r) gsl_noexcept {
-    const basic_string_span<typename std11::add_const<T>::type> l(u);
-
-    return std::lexicographical_compare(l.begin(), l.end(), r.begin(), r.end());
-  }
-#endif
-
-#else  //gsl_CONFIG( ALLOWS_NONSTRICT_SPAN_COMPARISON )
-
-  template <class T>
-  gsl_api inline gsl_constexpr14 bool operator==(basic_string_span<T> const& l, basic_string_span<T> const& r) gsl_noexcept {
-    return l.size() == r.size() && std::equal(l.begin(), l.end(), r.begin());
-  }
-
-  template <class T>
-  gsl_api inline gsl_constexpr14 bool operator<(basic_string_span<T> const& l, basic_string_span<T> const& r) gsl_noexcept {
-    return std::lexicographical_compare(l.begin(), l.end(), r.begin(), r.end());
-  }
-
-#endif  // gsl_CONFIG( ALLOWS_NONSTRICT_SPAN_COMPARISON )
-
-  template <class T, class U>
-  gsl_api inline gsl_constexpr14 bool operator!=(basic_string_span<T> const& l, U const& r) gsl_noexcept {
-    return !(l == r);
-  }
-
-  template <class T, class U>
-  gsl_api inline gsl_constexpr14 bool operator<=(basic_string_span<T> const& l, U const& r) gsl_noexcept {
-#if gsl_HAVE(DEFAULT_FUNCTION_TEMPLATE_ARG) || !gsl_CONFIG(ALLOWS_NONSTRICT_SPAN_COMPARISON)
-    return !(r < l);
-#else
-    basic_string_span<typename std11::add_const<T>::type> rr(r);
-    return !(rr < l);
-#endif
-  }
-
-  template <class T, class U>
-  gsl_api inline gsl_constexpr14 bool operator>(basic_string_span<T> const& l, U const& r) gsl_noexcept {
-#if gsl_HAVE(DEFAULT_FUNCTION_TEMPLATE_ARG) || !gsl_CONFIG(ALLOWS_NONSTRICT_SPAN_COMPARISON)
-    return (r < l);
-#else
-    basic_string_span<typename std11::add_const<T>::type> rr(r);
-    return (rr < l);
-#endif
-  }
-
-  template <class T, class U>
-  gsl_api inline gsl_constexpr14 bool operator>=(basic_string_span<T> const& l, U const& r) gsl_noexcept {
-    return !(l < r);
-  }
-
-#if gsl_HAVE(DEFAULT_FUNCTION_TEMPLATE_ARG)
-
-  template <class T, class U
-                         gsl_REQUIRES_T((!detail::is_basic_string_span<U>::value))>
-  gsl_api inline gsl_constexpr14 bool operator!=(U const& l, basic_string_span<T> const& r) gsl_noexcept {
-    return !(l == r);
-  }
-
-  template <class T, class U
-                         gsl_REQUIRES_T((!detail::is_basic_string_span<U>::value))>
-  gsl_api inline gsl_constexpr14 bool operator<=(U const& l, basic_string_span<T> const& r) gsl_noexcept {
-    return !(r < l);
-  }
-
-  template <class T, class U
-                         gsl_REQUIRES_T((!detail::is_basic_string_span<U>::value))>
-  gsl_api inline gsl_constexpr14 bool operator>(U const& l, basic_string_span<T> const& r) gsl_noexcept {
-    return (r < l);
-  }
-
-  template <class T, class U
-                         gsl_REQUIRES_T((!detail::is_basic_string_span<U>::value))>
-  gsl_api inline gsl_constexpr14 bool operator>=(U const& l, basic_string_span<T> const& r) gsl_noexcept {
-    return !(l < r);
-  }
-
-#endif  // gsl_HAVE( DEFAULT_FUNCTION_TEMPLATE_ARG )
-
-  // convert basic_string_span to byte span:
-
-  template <class T>
-  gsl_api inline span<const byte> as_bytes(basic_string_span<T> spn) gsl_noexcept {
-    return span<const byte>(reinterpret_cast<const byte*>(spn.data()), spn.size_bytes());  // NOLINT
-  }
-
-  //
-  // String types:
-  //
-
-  typedef char* zstring;
-  typedef const char* czstring;
-
-#if gsl_HAVE(WCHAR)
-  typedef wchar_t* zwstring;
-  typedef const wchar_t* cwzstring;
-#endif
-
-  typedef basic_string_span<char> string_span;
-  typedef basic_string_span<char const> cstring_span;
-
-#if gsl_HAVE(WCHAR)
-  typedef basic_string_span<wchar_t> wstring_span;
-  typedef basic_string_span<wchar_t const> cwstring_span;
-#endif
-
-  // to_string() allow (explicit) conversions from string_span to string
-
-#if 0
-
-template< class T >
-gsl_api inline std::basic_string< typename std::remove_const<T>::type > to_string( basic_string_span<T> spn )
-{
-     std::string( spn.data(), spn.length() );
-}
-
-#else
-
-  gsl_api inline std::string to_string(string_span const& spn) {
-    return std::string(spn.data(), spn.length());
-  }
-
-  gsl_api inline std::string to_string(cstring_span const& spn) {
-    return std::string(spn.data(), spn.length());
-  }
-
-#if gsl_HAVE(WCHAR)
-
-  gsl_api inline std::wstring to_string(wstring_span const& spn) {
-    return std::wstring(spn.data(), spn.length());
-  }
-
-  gsl_api inline std::wstring to_string(cwstring_span const& spn) {
-    return std::wstring(spn.data(), spn.length());
-  }
-
-#endif  // gsl_HAVE( WCHAR )
-#endif  // to_string()
-
-// Disable stream support since Apple std lib is broken
-#if 0  // Apple stream
-  //
-  // Stream output for string_span types
-  //
-
-  namespace detail {
-
-#ifdef __APPLE__
-  using streamoff = long long;
-  using streamsize = long long;
-#else
-  using streamsize = std::streamsize;
-  using streamoff = std::streamoff;
-#endif
-
-  template <class Stream>
-  gsl_api void write_padding(Stream& os, streamsize n) {
-    for (streamsize i = 0; i < n; ++i)
-      os.rdbuf()->sputc(os.fill());
-  }
-
-  template <class Stream, class Span>
-  gsl_api Stream& write_to_stream(Stream& os, Span const& spn) {
-    typename Stream::sentry sentry(os);
-
-    if (!os)
-      return os;
-
-    const streamsize length = narrow<streamsize>(spn.length());
-
-    // Whether, and how, to pad
-    const bool pad = (length < os.width());
-    const bool left_pad = pad && (os.flags() & std::ios_base::adjustfield) == std::ios_base::right;
-
-    if (left_pad)
-      write_padding(os, os.width() - length);
-
-    // Write span characters
-    os.rdbuf()->sputn(spn.begin(), length);
-
-    if (pad && !left_pad)
-      write_padding(os, os.width() - length);
-
-    // Reset output stream width
-    os.width(0);
-
-    return os;
-  }
-
-  }  // namespace detail
-
-  template <typename Traits>
-  gsl_api std::basic_ostream<char, Traits>& operator<<(std::basic_ostream<char, Traits>& os, string_span const& spn) {
-    return detail::write_to_stream(os, spn);
-  }
-
-  template <typename Traits>
-  gsl_api std::basic_ostream<char, Traits>& operator<<(std::basic_ostream<char, Traits>& os, cstring_span const& spn) {
-    return detail::write_to_stream(os, spn);
-  }
-
-#if gsl_HAVE(WCHAR)
-
-  template <typename Traits>
-  gsl_api std::basic_ostream<wchar_t, Traits>& operator<<(std::basic_ostream<wchar_t, Traits>& os, wstring_span const& spn) {
-    return detail::write_to_stream(os, spn);
-  }
-
-  template <typename Traits>
-  gsl_api std::basic_ostream<wchar_t, Traits>& operator<<(std::basic_ostream<wchar_t, Traits>& os, cwstring_span const& spn) {
-    return detail::write_to_stream(os, spn);
-  }
-
-#endif  // gsl_HAVE( WCHAR )
-#endif  // Apple stream
-
-  //
-  // ensure_sentinel()
-  //
-  // Provides a way to obtain a span from a contiguous sequence
-  // that ends with a (non-inclusive) sentinel value.
-  //
-  // Will fail-fast if sentinel cannot be found before max elements are examined.
-  //
-  namespace detail {
-
-  template <class T, class SizeType, const T Sentinel>
-  gsl_api static span<T> ensure_sentinel(T* seq, SizeType max = (std::numeric_limits<SizeType>::max)()) {
-    typedef T* pointer;
-
-    gsl_SUPPRESS_MSVC_WARNING(26429, "f.23: symbol 'cur' is never tested for nullness, it can be marked as not_null")
-
-        pointer cur = seq;
-
-    while (static_cast<SizeType>(cur - seq) < max && *cur != Sentinel)
-      ++cur;
-
-    Expects(*cur == Sentinel);
-
-    return span<T>(seq, narrow_cast<typename span<T>::index_type>(cur - seq));
-  }
-  }  // namespace detail
-
-  //
-  // ensure_z - creates a string_span for a czstring or cwzstring.
-  // Will fail fast if a null-terminator cannot be found before
-  // the limit of size_type.
-  //
-
-  template <class T>
-  gsl_api inline span<T> ensure_z(T* const& sz, size_t max = (std::numeric_limits<size_t>::max)()) {
-    return detail::ensure_sentinel<T, size_t, 0>(sz, max);
-  }
-
-  template <class T, size_t N>
-  gsl_api inline span<T> ensure_z(T(&sz)[N]) {
-    return ensure_z(gsl_ADDRESSOF(sz[0]), N);
-  }
-
-#if gsl_HAVE(TYPE_TRAITS)
-
-  template <class Container>
-  gsl_api inline span<typename std::remove_pointer<typename Container::pointer>::type>
-  ensure_z(Container & cont) {
-    return ensure_z(cont.data(), cont.length());
-  }
-#endif
-
-  //
-  // basic_zstring_span<> - A view of contiguous null-terminated characters, replace (*,len).
-  //
-
-  template <typename T>
-  class basic_zstring_span {
-   public:
-    typedef T element_type;
-    typedef span<T> span_type;
-
-    typedef typename span_type::index_type index_type;
-    typedef typename span_type::difference_type difference_type;
-
-    typedef element_type* czstring_type;
-    typedef basic_string_span<element_type> string_span_type;
-
-    gsl_api gsl_constexpr14 basic_zstring_span(span_type s)
-        : span_(s) {
-      // expects a zero-terminated span
-      Expects(s[s.size() - 1] == '\0');
-    }
-
-#if gsl_HAVE(IS_DEFAULT)
-    gsl_constexpr basic_zstring_span(basic_zstring_span const& other) = default;
-    gsl_constexpr basic_zstring_span(basic_zstring_span&& other) = default;
-    gsl_constexpr14 basic_zstring_span& operator=(basic_zstring_span const& other) = default;
-    gsl_constexpr14 basic_zstring_span& operator=(basic_zstring_span&& other) = default;
-#else
-    gsl_api gsl_constexpr basic_zstring_span(basic_zstring_span const& other) : span_(other.span_) {}
-    gsl_api gsl_constexpr basic_zstring_span& operator=(basic_zstring_span const& other) {
-      span_ = other.span_;
-      return *this;
-    }
-#endif
-
-    gsl_api gsl_constexpr bool empty() const gsl_noexcept {
-      return span_.size() == 0;
-    }
-
-    gsl_api gsl_constexpr string_span_type as_string_span() const gsl_noexcept {
-      return string_span_type(span_.data(), span_.size() > 1 ? span_.size() - 1 : 0);
-    }
-
-    gsl_api gsl_constexpr string_span_type ensure_z() const {
-      return gsl::ensure_z(span_.data(), span_.size());
-    }
-
-    gsl_api gsl_constexpr czstring_type assume_z() const gsl_noexcept {
-      return span_.data();
-    }
-
-   private:
-    span_type span_;
-  };
-
-  //
-  // zString types:
-  //
-
-  typedef basic_zstring_span<char> zstring_span;
-  typedef basic_zstring_span<char const> czstring_span;
-
-#if gsl_HAVE(WCHAR)
-  typedef basic_zstring_span<wchar_t> wzstring_span;
-  typedef basic_zstring_span<wchar_t const> cwzstring_span;
-#endif
-
-}  // namespace gsl
-
-#if gsl_CPP11_OR_GREATER || gsl_COMPILER_MSVC_VERSION >= 120
-
-namespace std {
-
-template <>
-struct hash<gsl::byte> {
- public:
-  std::size_t operator()(gsl::byte v) const gsl_noexcept {
-    return gsl::to_integer<std::size_t>(v);
-  }
-};
-
-}  // namespace std
-
-#endif
-
-gsl_RESTORE_MSVC_WARNINGS()
-
-#endif  // GSL_GSL_LITE_HPP_INCLUDED
-
-    // end of file
diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
index e4e26039e6..98157e6830 100644
--- a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
+++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
@@ -75,14 +75,14 @@ static TensorShape GetArrayShape(PyArrayObject* pyObject) {
   const int ndim = PyArray_NDIM(pyObject);
   const npy_intp* npy_dims = PyArray_DIMS(pyObject);
   auto span = gsl::make_span(npy_dims, ndim);
-  std::vector<int64_t> dims(span.cbegin(), span.cend());
+  std::vector<int64_t> dims(span.begin(), span.end());
   TensorShape shape(std::move(dims));
   return shape;
 }
 
 TensorShape GetShape(const py::array& arr) {
   auto span = gsl::make_span(arr.shape(), arr.ndim());
-  std::vector<int64_t> dims(span.cbegin(), span.cend());
+  std::vector<int64_t> dims(span.begin(), span.end());
   TensorShape shape(std::move(dims));
   return shape;
 }
diff --git a/onnxruntime/test/common/narrow_test.cc b/onnxruntime/test/common/narrow_test.cc
new file mode 100644
index 0000000000..321971162f
--- /dev/null
+++ b/onnxruntime/test/common/narrow_test.cc
@@ -0,0 +1,76 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/common/narrow.h"
+
+#include <complex>
+#include <limits>
+
+#include "gtest/gtest.h"
+
+// These tests were adapted from:
+// https://github.com/microsoft/GSL/blob/a3534567187d2edc428efd3f13466ff75fe5805c/tests/utils_tests.cpp#L127-L152
+
+namespace onnxruntime::test {
+
+#if defined(ORT_NO_EXCEPTIONS)
+
+#define NARROW_FAILURE_TEST_SUITE NarrowDeathTest
+#define ASSERT_NARROW_FAILURE(expr) \
+  ASSERT_DEATH((expr), "narrowing error")
+
+#else  // ^^ defined(ORT_NO_EXCEPTIONS) ^^ / vv !defined(ORT_NO_EXCEPTIONS) vv
+
+#define NARROW_FAILURE_TEST_SUITE NarrowTest
+#define ASSERT_NARROW_FAILURE(expr) \
+  ASSERT_THROW((expr), gsl::narrowing_error)
+
+#endif  // !defined(ORT_NO_EXCEPTIONS)
+
+TEST(NarrowTest, Basic) {
+  constexpr int n = 120;
+  constexpr char c = narrow<char>(n);
+  EXPECT_EQ(c, 120);
+
+  EXPECT_EQ(narrow<uint32_t>(int32_t(0)), uint32_t{0});
+  EXPECT_EQ(narrow<uint32_t>(int32_t(1)), uint32_t{1});
+  constexpr auto int32_max = std::numeric_limits<int32_t>::max();
+  EXPECT_EQ(narrow<uint32_t>(int32_max), static_cast<uint32_t>(int32_max));
+
+  EXPECT_EQ(narrow<std::complex<float>>(std::complex<double>(4, 2)), std::complex<float>(4, 2));
+}
+
+TEST(NARROW_FAILURE_TEST_SUITE, CharOutOfRange) {
+  constexpr int n = 300;
+  ASSERT_NARROW_FAILURE(narrow<char>(n));
+}
+
+TEST(NARROW_FAILURE_TEST_SUITE, MinusOneToUint32OutOfRange) {
+  ASSERT_NARROW_FAILURE(narrow<uint32_t>(int32_t(-1)));
+}
+
+TEST(NARROW_FAILURE_TEST_SUITE, Int32MinToUint32OutOfRange) {
+  constexpr auto int32_min = std::numeric_limits<int32_t>::min();
+  ASSERT_NARROW_FAILURE(narrow<uint32_t>(int32_min));
+}
+
+TEST(NARROW_FAILURE_TEST_SUITE, UnsignedOutOfRange) {
+  constexpr int n = -42;
+  ASSERT_NARROW_FAILURE(narrow<unsigned>(n));
+}
+
+namespace {
+constexpr double kDoubleWithLossyRoundTripFloatConversion = 4.2;
+static_assert(static_cast<double>(static_cast<float>(kDoubleWithLossyRoundTripFloatConversion)) !=
+              kDoubleWithLossyRoundTripFloatConversion);
+}  // namespace
+
+TEST(NARROW_FAILURE_TEST_SUITE, FloatLossyRoundTripConversion) {
+  ASSERT_NARROW_FAILURE(narrow<float>(kDoubleWithLossyRoundTripFloatConversion));
+}
+
+TEST(NARROW_FAILURE_TEST_SUITE, ComplexFloatLossyRoundTripConversion) {
+  ASSERT_NARROW_FAILURE(narrow<std::complex<float>>(std::complex<double>(kDoubleWithLossyRoundTripFloatConversion)));
+}
+
+}  // namespace onnxruntime::test
diff --git a/onnxruntime/test/common/span_utils_test.cc b/onnxruntime/test/common/span_utils_test.cc
index 46c99929d6..4ebd2ac84c 100644
--- a/onnxruntime/test/common/span_utils_test.cc
+++ b/onnxruntime/test/common/span_utils_test.cc
@@ -22,7 +22,7 @@ TEST(Common, SpanUtilsTests) {
     // list by var
     auto list = {1, 2, 3};
     auto span = AsSpan(list);
-    ASSERT_EQ(gsl::make_span(list), span);
+    ASSERT_TRUE(SpanEq(gsl::make_span(list.begin(), list.size()), span));
     // no type conversion int -> int64_t
     // use std::array
   }
@@ -43,7 +43,7 @@ TEST(Common, SpanUtilsTests) {
     std::vector<int64_t> vec = {1, 2, 3};
     auto span = AsSpan(vec);
     ASSERT_EQ(vec.size(), span.size());
-    ASSERT_EQ(gsl::make_span(vec), span);
+    ASSERT_TRUE(SpanEq(gsl::make_span(vec), span));
     f(span);
   }
 
@@ -51,7 +51,7 @@ TEST(Common, SpanUtilsTests) {
     InlinedVector<int64_t> vec = {1, 2, 3};
     auto span = AsSpan(vec);
     ASSERT_EQ(vec.size(), span.size());
-    ASSERT_EQ(gsl::make_span(vec), span);
+    ASSERT_TRUE(SpanEq(gsl::make_span(vec), span));
     f(span);
   }
 
@@ -60,7 +60,7 @@ TEST(Common, SpanUtilsTests) {
     int64_t arr[] = {1, 2, 3};
     auto span = AsSpan(arr);
     ASSERT_EQ(std::size(arr), span.size());
-    ASSERT_EQ(gsl::make_span(arr), span);
+    ASSERT_TRUE(SpanEq(gsl::make_span(arr), span));
     f(span);
   }
 }
diff --git a/onnxruntime/test/common/tensor_op_test_utils.h b/onnxruntime/test/common/tensor_op_test_utils.h
index 6ef0a58565..0e58dd64f1 100644
--- a/onnxruntime/test/common/tensor_op_test_utils.h
+++ b/onnxruntime/test/common/tensor_op_test_utils.h
@@ -6,7 +6,7 @@
 #include <random>
 #include <type_traits>
 
-#include <gsl/gsl>
+#include "core/common/gsl.h"
 
 #include "gtest/gtest.h"
 
@@ -23,7 +23,7 @@ namespace detail {
 inline int64_t SizeFromDims(gsl::span<const int64_t> dims, gsl::span<const int64_t> strides = {}) {
   int64_t size = 1;
   if (strides.empty()) {
-    size = std::accumulate(dims.cbegin(), dims.cend(), static_cast<int64_t>(1), std::multiplies<int64_t>{});
+    size = std::accumulate(dims.begin(), dims.end(), static_cast<int64_t>(1), std::multiplies<int64_t>{});
   } else {
     ORT_ENFORCE(dims.size() == strides.size());
     for (size_t dim = 0; dim < dims.size(); ++dim) {
diff --git a/onnxruntime/test/contrib_ops/beam_search_test.cc b/onnxruntime/test/contrib_ops/beam_search_test.cc
index 70a576a0ce..37a2f42949 100644
--- a/onnxruntime/test/contrib_ops/beam_search_test.cc
+++ b/onnxruntime/test/contrib_ops/beam_search_test.cc
@@ -1,10 +1,10 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <gsl/gsl>
 #include <memory>
 #include <vector>
 #include "gtest/gtest.h"
+#include "core/common/gsl.h"
 #include "core/session/onnxruntime_cxx_api.h"
 #include "test/common/cuda_op_test_utils.h"
 
@@ -87,7 +87,7 @@ TEST(BeamSearchTest, GptBeamSearchFp32) {
   ASSERT_EQ(expected_output_shape, result_ts.GetShape());
   const auto* result_vals = sequences.GetTensorData<int32_t>();
   auto result_span = gsl::make_span(result_vals, expected_output.size());
-  ASSERT_TRUE(std::equal(expected_output.cbegin(), expected_output.cend(), result_span.cbegin(), result_span.cend()));
+  ASSERT_TRUE(std::equal(expected_output.cbegin(), expected_output.cend(), result_span.begin(), result_span.end()));
 }
 
 TEST(BeamSearchTest, GptBeamSearchFp16) {
@@ -171,7 +171,7 @@ TEST(BeamSearchTest, GptBeamSearchFp16) {
     ASSERT_EQ(expected_output_shape, result_ts.GetShape());
     const auto* result_vals = sequences.GetTensorData<int32_t>();
     auto result_span = gsl::make_span(result_vals, expected_output.size());
-    ASSERT_TRUE(std::equal(expected_output.cbegin(), expected_output.cend(), result_span.cbegin(), result_span.cend()));
+    ASSERT_TRUE(std::equal(expected_output.cbegin(), expected_output.cend(), result_span.begin(), result_span.end()));
   }
 }
 
diff --git a/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc b/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc
index b6a3d4461f..c70f659f1b 100644
--- a/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc
+++ b/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc
@@ -1,6 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#include "core/common/span_utils.h"
 #include "core/framework/tensor.h"
 #include "core/session/inference_session.h"
 #include "test/common/tensor_op_test_utils.h"
@@ -41,7 +42,7 @@ void TestDynamicQuantizeMatMul(const std::vector<int64_t>& A_dims,
   });
 
   int64_t b_scale_zp_size = per_column ? B_dims.back() : 1;
-  std::vector<float> B_scale = random.Uniform<float>({b_scale_zp_size}, -0.1f, 0.1f);
+  std::vector<float> B_scale = random.Uniform<float>(AsSpan({b_scale_zp_size}), -0.1f, 0.1f);
   std::vector<T> B_zero_point(b_scale_zp_size);
   std::for_each(B_zero_point.begin(),
                 B_zero_point.end(),
@@ -49,7 +50,7 @@ void TestDynamicQuantizeMatMul(const std::vector<int64_t>& A_dims,
                   zp = static_cast<T>(random.Uniform<int32_t>(std::array<int64_t, 1>{1}, std::numeric_limits<T>::min(), std::numeric_limits<T>::max())[0]);
                 });
 
-  std::vector<float> Bias = random.Uniform<float>({B_dims.back()}, -0.1f, 0.1f);
+  std::vector<float> Bias = random.Uniform<float>(AsSpan({B_dims.back()}), -0.1f, 0.1f);
 
   OpTester test("DynamicQuantizeMatMul", 1, onnxruntime::kMSDomain);
   test.AddInput<float>("A", A_dims, A_data);
diff --git a/onnxruntime/test/contrib_ops/math/matmul_sparse_test.cc b/onnxruntime/test/contrib_ops/math/matmul_sparse_test.cc
index 6e3ead7013..8262ad95ed 100644
--- a/onnxruntime/test/contrib_ops/math/matmul_sparse_test.cc
+++ b/onnxruntime/test/contrib_ops/math/matmul_sparse_test.cc
@@ -51,7 +51,7 @@ void ConvertToCsr(gsl::span<const T> input_span,
   const auto dense_size = rows * cols;
   ASSERT_EQ(input_span.size(), static_cast<size_t>(dense_size));
 
-  const int64_t nnz = std::count_if(input_span.cbegin(), input_span.cend(),
+  const int64_t nnz = std::count_if(input_span.begin(), input_span.end(),
                                     [](T v) { return v != T(0); });
 
   std::vector<T> values;
@@ -97,7 +97,7 @@ void ConvertToCoo(gsl::span<const T> input_span,
   const auto dense_size = rows * cols;
   ASSERT_EQ(input_span.size(), static_cast<size_t>(dense_size));
 
-  const int64_t nnz = std::count_if(input_span.cbegin(), input_span.cend(),
+  const int64_t nnz = std::count_if(input_span.begin(), input_span.end(),
                                     [](T v) { return v != T(0); });
 
   std::vector<T> values;
diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
index 47ac521137..7f81503ea1 100644
--- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
+++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
@@ -1,6 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#include "core/common/span_utils.h"
 #include "core/framework/tensor.h"
 #include "core/mlas/inc/mlas.h"
 #include "core/session/inference_session.h"
@@ -49,11 +50,11 @@ void TestMatMulIntegerToFloat(const std::vector<int64_t>& A_dims,
     return static_cast<WType>(v);
   });
 
-  std::vector<float> A_scale = random.Uniform<float>(std::array<int64_t, 1>{1}, -0.1f, 0.1f);
+  std::vector<float> A_scale = random.Uniform<float>(AsSpan<int64_t>({1}), -0.1f, 0.1f);
   std::vector<IType> A_zero_point{(std::numeric_limits<IType>::lowest() + std::numeric_limits<IType>::max() + IType(2)) / 2};
 
   int64_t b_scale_zp_size = per_column ? B_dims.back() : 1;
-  std::vector<float> B_scale = random.Uniform<float>({b_scale_zp_size}, -0.1f, 0.1f);
+  std::vector<float> B_scale = random.Uniform<float>(AsSpan({b_scale_zp_size}), -0.1f, 0.1f);
 
   std::vector<WType> B_zero_point(b_scale_zp_size);
   std::for_each(B_zero_point.begin(),
@@ -64,7 +65,7 @@ void TestMatMulIntegerToFloat(const std::vector<int64_t>& A_dims,
                                                                   std::numeric_limits<WType>::max())[0]);
                 });
 
-  std::vector<float> Bias = random.Uniform<float>({B_dims.back()}, -0.1f, 0.1f);
+  std::vector<float> Bias = random.Uniform<float>(AsSpan({B_dims.back()}), -0.1f, 0.1f);
 
   OpTester test("MatMulIntegerToFloat", 1, onnxruntime::kMSDomain);
   test.AddInput<IType>("A", A_dims, A_data);
diff --git a/onnxruntime/test/contrib_ops/quantize_attention_op_test.cc b/onnxruntime/test/contrib_ops/quantize_attention_op_test.cc
index 5c9890f4ef..c1244a922c 100644
--- a/onnxruntime/test/contrib_ops/quantize_attention_op_test.cc
+++ b/onnxruntime/test/contrib_ops/quantize_attention_op_test.cc
@@ -6,6 +6,7 @@
 #include <vector>
 
 #include "gtest/gtest.h"
+#include "core/common/span_utils.h"
 #include "test/common/quantization_test_utils.h"
 #include "test/common/tensor_op_test_utils.h"
 #include "test/common/cuda_op_test_utils.h"
@@ -875,7 +876,7 @@ void TestQuantizedAttentionPastState(int64_t batch,
   std::vector<float> bias_data = random.Gaussian<float>(bias_dims, 0.0f, 0.3f);
 
   std::vector<float> input_scale{0.005f};
-  std::vector<float> weight_scale(random.Uniform<float>({weight_scale_zp_size}, -0.01f, 0.01f));
+  std::vector<float> weight_scale(random.Uniform<float>(AsSpan({weight_scale_zp_size}), -0.01f, 0.01f));
 
   std::vector<int64_t> past_dims{2, batch, head_number, past_seq_len, head_size};
   std::vector<float> past_data = random.Gaussian<float>(past_dims, 0.0f, 0.3f);
diff --git a/onnxruntime/test/eager/ort_invoker_test.cc b/onnxruntime/test/eager/ort_invoker_test.cc
index ce437acc8a..42ade91f67 100644
--- a/onnxruntime/test/eager/ort_invoker_test.cc
+++ b/onnxruntime/test/eager/ort_invoker_test.cc
@@ -4,6 +4,7 @@
 #include "gtest/gtest.h"
 #include "core/eager/ort_kernel_invoker.h"
 #include "core/common/logging/sinks/clog_sink.h"
+#include "core/common/span_utils.h"
 #include "core/providers/cpu/cpu_execution_provider.h"
 #include "test/framework/test_utils.h"
 #include "asserts.h"
@@ -22,7 +23,7 @@ TEST(InvokerTest, Basic) {
       std::unique_ptr<logging::ISink>{new logging::CLogSink{}},
       logging::Severity::kVERBOSE, false,
       logging::LoggingManager::InstanceType::Default,
-      &logger_id); 
+      &logger_id);
   std::unique_ptr<Environment> env;
   ASSERT_STATUS_OK(Environment::Create(std::move(logging_manager), env));
   IOnnxRuntimeOpSchemaRegistryList tmp_op_registry = {};
@@ -39,7 +40,7 @@ TEST(InvokerTest, Basic) {
   ASSERT_STATUS_OK(kernel_invoker.Invoke("Add", {A, B}, result, nullptr));
   const Tensor& C = result.back().Get<Tensor>();
   auto& c_shape = C.Shape();
-  EXPECT_EQ(c_shape.GetDims(), gsl::make_span(dims_mul_x));
+  EXPECT_TRUE(SpanEq(c_shape.GetDims(), gsl::make_span(dims_mul_x)));
 
   std::vector<float> expected_result = {2.0f, 4.0f, 6.0f, 8.0f, 10.0f, 12.0f};
   auto* c_data = C.Data<float>();
@@ -55,7 +56,7 @@ TEST(InvokerTest, Inplace) {
       std::unique_ptr<logging::ISink>{new logging::CLogSink{}},
       logging::Severity::kVERBOSE, false,
       logging::LoggingManager::InstanceType::Default,
-      &logger_id); 
+      &logger_id);
   std::unique_ptr<Environment> env;
   ASSERT_STATUS_OK(Environment::Create(std::move(logging_manager), env));
   IOnnxRuntimeOpSchemaRegistryList tmp_op_registry = {};
@@ -121,7 +122,7 @@ TEST(InvokerTest, CustomOp) {
       std::unique_ptr<logging::ISink>{new logging::CLogSink{}},
       logging::Severity::kVERBOSE, false,
       logging::LoggingManager::InstanceType::Default,
-      &logger_id); 
+      &logger_id);
   std::unique_ptr<Environment> env;
   ASSERT_STATUS_OK(Environment::Create(std::move(logging_manager), env));
   ORTInvoker kernel_invoker(std::move(cpu_execution_provider), env->GetLoggingManager()->DefaultLogger(), regs);
diff --git a/onnxruntime/test/framework/execution_frame_test.cc b/onnxruntime/test/framework/execution_frame_test.cc
index 4875b988c9..ca1c0cbc38 100644
--- a/onnxruntime/test/framework/execution_frame_test.cc
+++ b/onnxruntime/test/framework/execution_frame_test.cc
@@ -195,7 +195,7 @@ TEST_F(ExecutionFrameTest, FeedInDataTest) {
   ASSERT_TRUE(mlvalue_name_idx_map.GetIdx("Y", y_idx).IsOK());
 
   vector<OrtValue> outputs;
-  ExecutionFrame frame({x_idx}, {value}, {y_idx}, outputs, {}, state);
+  ExecutionFrame frame(AsSpan({x_idx}), AsSpan({value}), AsSpan({y_idx}), outputs, {}, state);
 
   OrtValue* p_ml_value = frame.GetMutableNodeInputOrOutputMLValue(0);
   Tensor* p_tensor_arg_0 = p_ml_value ? p_ml_value->GetMutable<Tensor>() : nullptr;
@@ -271,7 +271,7 @@ TEST_F(ExecutionFrameTest, MemPatternTest) {
                        std::vector<float>(6, 1.0f), &v3);
 
   std::vector<OrtValue> outputs;
-  ExecutionFrame frame(AsSpan({x1_idx, x2_idx, x3_idx}), AsSpan({v1, v2, v3}), {t3_idx}, outputs, {}, state);
+  ExecutionFrame frame(AsSpan({x1_idx, x2_idx, x3_idx}), AsSpan({v1, v2, v3}), AsSpan({t3_idx}), outputs, {}, state);
 
   OrtValue& mlvalue3 = *frame.GetMutableNodeInputOrOutputMLValue(3);
   OrtValue& mlvalue4 = *frame.GetMutableNodeInputOrOutputMLValue(4);
@@ -358,7 +358,7 @@ TEST_F(ExecutionFrameTest, MemPatternWithExternalOutputsTest) {
   CreateMLValue<float>(cpu_allocator, std::vector<int64_t>{2, 2}, std::vector<float>(4, 1.0f), &t_value);
 
   vector<OrtValue> outputs;
-  ExecutionFrame frame({x_idx}, {x_value}, {y_idx}, outputs, {}, state);
+  ExecutionFrame frame(AsSpan({x_idx}), AsSpan({x_value}), AsSpan({y_idx}), outputs, {}, state);
 
   ASSERT_FALSE(frame.GetMutableNodeInputOrOutputMLValue(t_idx)->IsTensor());
   ASSERT_STATUS_OK(frame.SetOutputMLValue(t_idx, t_value));
diff --git a/onnxruntime/test/framework/sparse_kernels_test.cc b/onnxruntime/test/framework/sparse_kernels_test.cc
index 62cc441e6d..3b38843257 100644
--- a/onnxruntime/test/framework/sparse_kernels_test.cc
+++ b/onnxruntime/test/framework/sparse_kernels_test.cc
@@ -1,6 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#include "core/common/span_utils.h"
 #include "core/framework/data_types.h"
 
 #include "core/graph/onnx_protobuf.h"
@@ -861,7 +862,7 @@ template <>
 void RawDataChecker<MLFloat16>(gsl::span<const MLFloat16> expected_bfloat, const TensorProto& actual) {
   int64_t actual_size = ActualSize(actual);
 
-  auto expected = expected_bfloat.as_span<const uint16_t>();
+  auto expected = ReinterpretAsSpan<const uint16_t>(expected_bfloat);
   const uint16_t* raw_data = reinterpret_cast<const uint16_t*>(actual.raw_data().data());
   auto actual_span = gsl::make_span<const uint16_t>(raw_data, actual_size);
 
@@ -872,7 +873,7 @@ template <>
 void RawDataChecker<BFloat16>(gsl::span<const BFloat16> expected_bfloat, const TensorProto& actual) {
   int64_t actual_size = ActualSize(actual);
 
-  auto expected = expected_bfloat.as_span<const uint16_t>();
+  auto expected = ReinterpretAsSpan<const uint16_t>(expected_bfloat);
   const uint16_t* raw_data = reinterpret_cast<const uint16_t*>(actual.raw_data().data());
   auto actual_span = gsl::make_span<const uint16_t>(raw_data, actual_size);
 
@@ -1084,7 +1085,7 @@ void RawSparseDataChecker<BFloat16>(gsl::span<const BFloat16> expected_bfloat,
   const int64_t actual_size = ActualSize(actual);
 
   static_assert(sizeof(uint16_t) == sizeof(BFloat16), "Expecting equal sizes");
-  auto expected = expected_bfloat.as_span<const uint16_t>();
+  auto expected = ReinterpretAsSpan<const uint16_t>(expected_bfloat);
   const uint16_t* raw_data = reinterpret_cast<const uint16_t*>(actual.values().raw_data().data());
   auto actual_span = gsl::make_span<const uint16_t>(raw_data, actual_size);
 
@@ -1099,7 +1100,7 @@ void RawSparseDataChecker<MLFloat16>(gsl::span<const MLFloat16> expected_bfloat,
   const int64_t actual_size = ActualSize(actual);
 
   static_assert(sizeof(uint16_t) == sizeof(MLFloat16), "Expecting equal sizes");
-  auto expected = expected_bfloat.as_span<const uint16_t>();
+  auto expected = ReinterpretAsSpan<const uint16_t>(expected_bfloat);
   const uint16_t* raw_data = reinterpret_cast<const uint16_t*>(actual.values().raw_data().data());
   auto actual_span = gsl::make_span<const uint16_t>(raw_data, actual_size);
 
@@ -1371,16 +1372,16 @@ TEST(SparseTensorConversionTests, CsrConversion) {
     ASSERT_EQ(dense_cpu_src.Shape(), dst.DenseShape());
     ASSERT_EQ(dst.NumValues(), expected_values.size());
     auto values = dst.Values().DataAsSpan<int32_t>();
-    ASSERT_TRUE(std::equal(expected_values.cbegin(), expected_values.cend(), values.cbegin(), values.cend()));
+    ASSERT_TRUE(std::equal(expected_values.cbegin(), expected_values.cend(), values.begin(), values.end()));
 
     auto csr_view = dst.AsCsr();
     auto inner = csr_view.Inner().DataAsSpan<int64_t>();
     ASSERT_EQ(expected_inner.size(), inner.size());
-    ASSERT_TRUE(std::equal(expected_inner.cbegin(), expected_inner.cend(), inner.cbegin(), inner.cend()));
+    ASSERT_TRUE(std::equal(expected_inner.cbegin(), expected_inner.cend(), inner.begin(), inner.end()));
 
     auto outer = csr_view.Outer().DataAsSpan<int64_t>();
     ASSERT_EQ(expected_outer.size(), outer.size());
-    ASSERT_TRUE(std::equal(expected_outer.cbegin(), expected_outer.cend(), outer.cbegin(), outer.cend()));
+    ASSERT_TRUE(std::equal(expected_outer.cbegin(), expected_outer.cend(), outer.begin(), outer.end()));
 
     // Let's convert back to make sure we get the original
     Tensor dense_dst;
@@ -1391,7 +1392,7 @@ TEST(SparseTensorConversionTests, CsrConversion) {
     ASSERT_EQ(dense_dst.Shape().Size(), vector_len(dense_data));
     auto dense_values_dst = dense_dst.DataAsSpan<int32_t>();
     ASSERT_EQ(dense_values_dst.size(), dense_data.size());
-    ASSERT_TRUE(std::equal(dense_values_dst.cbegin(), dense_values_dst.cend(), dense_data.cbegin(), dense_data.cend()));
+    ASSERT_TRUE(std::equal(dense_values_dst.begin(), dense_values_dst.end(), dense_data.cbegin(), dense_data.cend()));
   }
 
   // Strings test
@@ -1404,16 +1405,16 @@ TEST(SparseTensorConversionTests, CsrConversion) {
     ASSERT_EQ(str_cpu_src.Shape(), dst.DenseShape());
     ASSERT_EQ(dst.NumValues(), expected_values_str.size());
     auto values = dst.Values().DataAsSpan<std::string>();
-    ASSERT_TRUE(std::equal(expected_values_str.cbegin(), expected_values_str.cend(), values.cbegin(), values.cend()));
+    ASSERT_TRUE(std::equal(expected_values_str.cbegin(), expected_values_str.cend(), values.begin(), values.end()));
 
     auto csr_view = dst.AsCsr();
     auto inner = csr_view.Inner().DataAsSpan<int64_t>();
     ASSERT_EQ(expected_inner.size(), inner.size());
-    ASSERT_TRUE(std::equal(expected_inner.cbegin(), expected_inner.cend(), inner.cbegin(), inner.cend()));
+    ASSERT_TRUE(std::equal(expected_inner.cbegin(), expected_inner.cend(), inner.begin(), inner.end()));
 
     auto outer = csr_view.Outer().DataAsSpan<int64_t>();
     ASSERT_EQ(expected_outer.size(), outer.size());
-    ASSERT_TRUE(std::equal(expected_outer.cbegin(), expected_outer.cend(), outer.cbegin(), outer.cend()));
+    ASSERT_TRUE(std::equal(expected_outer.cbegin(), expected_outer.cend(), outer.begin(), outer.end()));
 
     // Let's convert back to make sure we get the original
     Tensor dense_dst;
@@ -1424,7 +1425,7 @@ TEST(SparseTensorConversionTests, CsrConversion) {
     ASSERT_EQ(dense_dst.Shape().Size(), vector_len(dense_data_str));
     auto dense_values_dst = dense_dst.DataAsSpan<std::string>();
     ASSERT_EQ(dense_values_dst.size(), dense_data.size());
-    ASSERT_TRUE(std::equal(dense_values_dst.cbegin(), dense_values_dst.cend(), dense_data_str.cbegin(), dense_data_str.cend()));
+    ASSERT_TRUE(std::equal(dense_values_dst.begin(), dense_values_dst.end(), dense_data_str.cbegin(), dense_data_str.cend()));
   }
 
   {
@@ -1437,16 +1438,16 @@ TEST(SparseTensorConversionTests, CsrConversion) {
     ASSERT_EQ(str_cpu_src.DenseShape(), dense_shape);
     ASSERT_EQ(str_cpu_src.NumValues(), expected_values_str.size());
     auto values = str_cpu_src.Values().DataAsSpan<std::string>();
-    ASSERT_TRUE(std::equal(expected_values_str.cbegin(), expected_values_str.cend(), values.cbegin(), values.cend()));
+    ASSERT_TRUE(std::equal(expected_values_str.cbegin(), expected_values_str.cend(), values.begin(), values.end()));
 
     auto csr_view = str_cpu_src.AsCsr();
     auto inner = csr_view.Inner().DataAsSpan<int64_t>();
     ASSERT_EQ(expected_inner.size(), inner.size());
-    ASSERT_TRUE(std::equal(expected_inner.cbegin(), expected_inner.cend(), inner.cbegin(), inner.cend()));
+    ASSERT_TRUE(std::equal(expected_inner.cbegin(), expected_inner.cend(), inner.begin(), inner.end()));
 
     auto outer = csr_view.Outer().DataAsSpan<int64_t>();
     ASSERT_EQ(expected_outer.size(), outer.size());
-    ASSERT_TRUE(std::equal(expected_outer.cbegin(), expected_outer.cend(), outer.cbegin(), outer.cend()));
+    ASSERT_TRUE(std::equal(expected_outer.cbegin(), expected_outer.cend(), outer.begin(), outer.end()));
   }
 
 #ifdef USE_CUDA
@@ -1474,7 +1475,7 @@ TEST(SparseTensorConversionTests, CsrConversion) {
     ASSERT_EQ(cpu_dense_dst.Shape().Size(), vector_len(dense_data));
     auto dense_dst_data = cpu_dense_dst.DataAsSpan<int32_t>();
     ASSERT_EQ(dense_dst_data.size(), dense_data.size());
-    ASSERT_TRUE(std::equal(dense_dst_data.cbegin(), dense_dst_data.cend(), dense_data.cbegin(), dense_data.cend()));
+    ASSERT_TRUE(std::equal(dense_dst_data.begin(), dense_dst_data.end(), dense_data.cbegin(), dense_data.cend()));
   }
   {
     // Test cases when it is all zeros
@@ -1500,7 +1501,7 @@ TEST(SparseTensorConversionTests, CsrConversion) {
     ASSERT_STATUS_OK(dtm.CopyTensor(gpu_dense_dst, cpu_dense_dst));
     auto dense_dst_data = cpu_dense_dst.DataAsSpan<int32_t>();
     ASSERT_EQ(dense_dst_data.size(), dense_data_all_zeros.size());
-    ASSERT_TRUE(std::equal(dense_dst_data.cbegin(), dense_dst_data.cend(), dense_data_all_zeros.cbegin(), dense_data_all_zeros.cend()));
+    ASSERT_TRUE(std::equal(dense_dst_data.begin(), dense_dst_data.end(), dense_data_all_zeros.cbegin(), dense_data_all_zeros.cend()));
   }
 #endif
 }
@@ -1583,12 +1584,12 @@ TEST(SparseTensorConversionTests, CooConversion) {
     ASSERT_EQ(dense_cpu_src.Shape(), dst.DenseShape());
     ASSERT_EQ(dst.NumValues(), expected_values.size());
     auto values = dst.Values().DataAsSpan<int32_t>();
-    ASSERT_TRUE(std::equal(expected_values.cbegin(), expected_values.cend(), values.cbegin(), values.cend()));
+    ASSERT_TRUE(std::equal(expected_values.cbegin(), expected_values.cend(), values.begin(), values.end()));
     auto coo_view = dst.AsCoo();
     ASSERT_EQ(coo_view.Indices().Shape().GetDims().size(), 1U);
     auto indices = coo_view.Indices().DataAsSpan<int64_t>();
     ASSERT_EQ(indices.size(), expected_linear_indices.size());
-    ASSERT_TRUE(std::equal(indices.cbegin(), indices.cend(), expected_linear_indices.cbegin(), expected_linear_indices.cend()));
+    ASSERT_TRUE(std::equal(indices.begin(), indices.end(), expected_linear_indices.cbegin(), expected_linear_indices.cend()));
 
     // Now convert back to dense
     Tensor dense_dst;
@@ -1598,7 +1599,7 @@ TEST(SparseTensorConversionTests, CooConversion) {
     ASSERT_EQ(dense_dst.Shape(), sparse_src.DenseShape());
     auto dense_values_dst = dense_dst.DataAsSpan<int32_t>();
     ASSERT_EQ(dense_values_dst.size(), dense_data.size());
-    ASSERT_TRUE(std::equal(dense_values_dst.cbegin(), dense_values_dst.cend(), dense_data.cbegin(), dense_data.cend()));
+    ASSERT_TRUE(std::equal(dense_values_dst.begin(), dense_values_dst.end(), dense_data.cbegin(), dense_data.cend()));
   }
 
   // String test
@@ -1612,12 +1613,12 @@ TEST(SparseTensorConversionTests, CooConversion) {
     ASSERT_EQ(dst.NumValues(), expected_values_str.size());
 
     auto values = dst.Values().DataAsSpan<std::string>();
-    ASSERT_TRUE(std::equal(expected_values_str.cbegin(), expected_values_str.cend(), values.cbegin(), values.cend()));
+    ASSERT_TRUE(std::equal(expected_values_str.cbegin(), expected_values_str.cend(), values.begin(), values.end()));
     auto coo_view = dst.AsCoo();
     ASSERT_EQ(coo_view.Indices().Shape().GetDims().size(), 1U);
     auto indices = coo_view.Indices().DataAsSpan<int64_t>();
     ASSERT_EQ(indices.size(), expected_linear_indices.size());
-    ASSERT_TRUE(std::equal(indices.cbegin(), indices.cend(), expected_linear_indices.cbegin(), expected_linear_indices.cend()));
+    ASSERT_TRUE(std::equal(indices.begin(), indices.end(), expected_linear_indices.cbegin(), expected_linear_indices.cend()));
 
     // Now convert back to dense
     Tensor dense_dst;
@@ -1627,7 +1628,7 @@ TEST(SparseTensorConversionTests, CooConversion) {
     ASSERT_EQ(dense_dst.Shape(), sparse_src.DenseShape());
     auto dense_values_dst = dense_dst.DataAsSpan<std::string>();
     ASSERT_EQ(dense_values_dst.size(), dense_data_str.size());
-    ASSERT_TRUE(std::equal(dense_values_dst.cbegin(), dense_values_dst.cend(), dense_data_str.cbegin(), dense_data_str.cend()));
+    ASSERT_TRUE(std::equal(dense_values_dst.begin(), dense_values_dst.end(), dense_data_str.cbegin(), dense_data_str.cend()));
   }
 
   {
@@ -1640,12 +1641,12 @@ TEST(SparseTensorConversionTests, CooConversion) {
     ASSERT_EQ(str_cpu_src.DenseShape(), TensorShape(dense_shape));
     ASSERT_EQ(str_cpu_src.NumValues(), expected_values_str.size());
     auto values = str_cpu_src.Values().DataAsSpan<std::string>();
-    ASSERT_TRUE(std::equal(expected_values_str.cbegin(), expected_values_str.cend(), values.cbegin(), values.cend()));
+    ASSERT_TRUE(std::equal(expected_values_str.cbegin(), expected_values_str.cend(), values.begin(), values.end()));
 
     auto coo_view = str_cpu_src.AsCoo();
     auto indices = coo_view.Indices().DataAsSpan<int64_t>();
     ASSERT_EQ(expected_linear_indices.size(), indices.size());
-    ASSERT_TRUE(std::equal(expected_linear_indices.cbegin(), expected_linear_indices.cend(), indices.cbegin(), indices.cend()));
+    ASSERT_TRUE(std::equal(expected_linear_indices.cbegin(), expected_linear_indices.cend(), indices.begin(), indices.end()));
   }
 
   {
@@ -1657,13 +1658,13 @@ TEST(SparseTensorConversionTests, CooConversion) {
     ASSERT_EQ(dense_cpu_src.Shape(), dst.DenseShape());
     ASSERT_EQ(dst.NumValues(), expected_values.size());
     auto values = dst.Values().DataAsSpan<int32_t>();
-    ASSERT_TRUE(std::equal(expected_values.cbegin(), expected_values.cend(), values.cbegin(), values.cend()));
+    ASSERT_TRUE(std::equal(expected_values.cbegin(), expected_values.cend(), values.begin(), values.end()));
 
     auto coo_view = dst.AsCoo();
     ASSERT_EQ(coo_view.Indices().Shape().GetDims().size(), 2U);
     auto indices = coo_view.Indices().DataAsSpan<int64_t>();
     ASSERT_EQ(indices.size(), expected_2d_indices.size());
-    ASSERT_TRUE(std::equal(indices.cbegin(), indices.cend(), expected_2d_indices.cbegin(), expected_2d_indices.cend()));
+    ASSERT_TRUE(std::equal(indices.begin(), indices.end(), expected_2d_indices.cbegin(), expected_2d_indices.cend()));
 
     // Now convert back to dense
     Tensor dense_dst;
@@ -1673,7 +1674,7 @@ TEST(SparseTensorConversionTests, CooConversion) {
     ASSERT_EQ(dense_dst.Shape(), sparse_src.DenseShape());
     auto dense_values_dst = dense_dst.DataAsSpan<int32_t>();
     ASSERT_EQ(dense_values_dst.size(), dense_data.size());
-    ASSERT_TRUE(std::equal(dense_values_dst.cbegin(), dense_values_dst.cend(), dense_data.cbegin(), dense_data.cend()));
+    ASSERT_TRUE(std::equal(dense_values_dst.begin(), dense_values_dst.end(), dense_data.cbegin(), dense_data.cend()));
   }
 
 #ifdef USE_CUDA
@@ -1704,7 +1705,7 @@ TEST(SparseTensorConversionTests, CooConversion) {
     ASSERT_STATUS_OK(dtm.CopyTensor(gpu_dense_dst, cpu_dense_dst));
     auto dense_dst_data = cpu_dense_dst.DataAsSpan<int32_t>();
     ASSERT_EQ(dense_dst_data.size(), dense_data.size());
-    ASSERT_TRUE(std::equal(dense_dst_data.cbegin(), dense_dst_data.cend(), dense_data.cbegin(), dense_data.cend()));
+    ASSERT_TRUE(std::equal(dense_dst_data.begin(), dense_dst_data.end(), dense_data.cbegin(), dense_data.cend()));
   }
 
   {
@@ -1730,7 +1731,7 @@ TEST(SparseTensorConversionTests, CooConversion) {
     ASSERT_STATUS_OK(dtm.CopyTensor(gpu_dense_dst, cpu_dense_dst));
     auto dense_dst_data = cpu_dense_dst.DataAsSpan<int32_t>();
     ASSERT_EQ(dense_dst_data.size(), dense_data_all_zeros.size());
-    ASSERT_TRUE(std::equal(dense_dst_data.cbegin(), dense_dst_data.cend(), dense_data_all_zeros.cbegin(), dense_data_all_zeros.cend()));
+    ASSERT_TRUE(std::equal(dense_dst_data.begin(), dense_dst_data.end(), dense_data_all_zeros.cbegin(), dense_data_all_zeros.cend()));
   }
 #endif
 }
@@ -1812,13 +1813,13 @@ TEST(SparseTensorConversionTests, BlockSparse) {
     ASSERT_EQ(values_shape, own_buffer_tensor.Values().Shape());
     auto data_span = own_buffer_tensor.Values().DataAsSpan<int32_t>();
     ASSERT_EQ(data_blocks.size(), data_span.size());
-    ASSERT_TRUE(std::equal(data_blocks.cbegin(), data_blocks.cend(), data_span.cbegin(), data_span.cend()));
+    ASSERT_TRUE(std::equal(data_blocks.cbegin(), data_blocks.cend(), data_span.begin(), data_span.end()));
 
     const auto& indices = own_buffer_tensor.AsBlockSparse().Indices();
     ASSERT_EQ(indices_shape, indices.Shape());
     auto indices_span = indices.DataAsSpan<int32_t>();
     ASSERT_TRUE(std::equal(blocksparse_indices.cbegin(), blocksparse_indices.cend(),
-                           indices_span.cbegin(), indices_span.cend()));
+                           indices_span.begin(), indices_span.end()));
   }
 
   {
@@ -1831,13 +1832,13 @@ TEST(SparseTensorConversionTests, BlockSparse) {
     ASSERT_EQ(values_shape, user_buffer_tensor.Values().Shape());
     auto data_span = user_buffer_tensor.Values().DataAsSpan<int32_t>();
     ASSERT_EQ(data_blocks.size(), data_span.size());
-    ASSERT_TRUE(std::equal(data_blocks.cbegin(), data_blocks.cend(), data_span.cbegin(), data_span.cend()));
+    ASSERT_TRUE(std::equal(data_blocks.cbegin(), data_blocks.cend(), data_span.begin(), data_span.end()));
 
     const auto& indices = user_buffer_tensor.AsBlockSparse().Indices();
     ASSERT_EQ(indices_shape, indices.Shape());
     auto indices_span = indices.DataAsSpan<int32_t>();
     ASSERT_TRUE(std::equal(blocksparse_indices.cbegin(), blocksparse_indices.cend(),
-                           indices_span.cbegin(), indices_span.cend()));
+                           indices_span.begin(), indices_span.end()));
   }
 
   {
@@ -1852,13 +1853,13 @@ TEST(SparseTensorConversionTests, BlockSparse) {
     auto data_span = own_buffer_tensor.Values().DataAsSpan<std::string>();
     auto expected_span = gsl::make_span(expected_strings);
     ASSERT_EQ(expected_span.size(), data_span.size());
-    ASSERT_TRUE(std::equal(expected_span.cbegin(), expected_span.cend(), data_span.cbegin(), data_span.cend()));
+    ASSERT_TRUE(std::equal(expected_span.begin(), expected_span.end(), data_span.begin(), data_span.end()));
 
     const auto& indices = own_buffer_tensor.AsBlockSparse().Indices();
     ASSERT_EQ(indices_shape, indices.Shape());
     auto indices_span = indices.DataAsSpan<int32_t>();
     ASSERT_TRUE(std::equal(blocksparse_indices.cbegin(), blocksparse_indices.cend(),
-                           indices_span.cbegin(), indices_span.cend()));
+                           indices_span.begin(), indices_span.end()));
   }
 }
 #endif  // !defined(DISABLE_SPARSE_TENSORS)
diff --git a/onnxruntime/test/framework/tensor_shape_test.cc b/onnxruntime/test/framework/tensor_shape_test.cc
index 86c64650ba..44a70cf0b8 100644
--- a/onnxruntime/test/framework/tensor_shape_test.cc
+++ b/onnxruntime/test/framework/tensor_shape_test.cc
@@ -7,6 +7,8 @@
 
 #include "gtest/gtest.h"
 
+#include "core/common/span_utils.h"
+
 namespace onnxruntime {
 namespace utils {
 namespace test {
@@ -38,8 +40,8 @@ TEST(TensorShapeTest, VariousSizes) {
   TestShapeWithVector({12, 23, 34, 45, 56, 67, 78, 89, 90});
 
   // Test assigning a shape to a large then a small vector (causing it to switch from small block to large, then back to small)
-  TensorShapeVector small{1, 2, 3};
-  TensorShapeVector large{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+  const TensorShapeVector small{1, 2, 3};
+  const TensorShapeVector large{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
 
   TensorShape shape{small};
   EXPECT_EQ(shape.GetDims(), gsl::make_span(small));
@@ -58,11 +60,11 @@ TEST(TensorShapeTest, FromExistingBuffer) {
   auto shape_copy=shape;
 
   // Pointers and sizes should match as they're the same buffer
-  EXPECT_EQ(gsl::make_span(buffer).begin(), shape.GetDims().begin());
+  EXPECT_EQ(gsl::make_span(buffer).data(), shape.GetDims().data());
   EXPECT_EQ(gsl::make_span(buffer).size(), shape.GetDims().size());
 
   // Pointers should not match as they're no longer the same buffer
-  EXPECT_NE(gsl::make_span(buffer).begin(), shape_copy.GetDims().begin());
+  EXPECT_NE(gsl::make_span(buffer).data(), shape_copy.GetDims().data());
   // Size should still match
   EXPECT_EQ(gsl::make_span(buffer).size(), shape_copy.GetDims().size());
 
diff --git a/onnxruntime/test/framework/test_utils.h b/onnxruntime/test/framework/test_utils.h
index 1e97f44629..3a0c4bec1c 100644
--- a/onnxruntime/test/framework/test_utils.h
+++ b/onnxruntime/test/framework/test_utils.h
@@ -10,7 +10,7 @@
 #include "core/providers/cpu/cpu_execution_provider.h"
 #include "core/framework/ort_value.h"
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 #ifdef USE_CUDA
 #include "core/providers/providers.h"
diff --git a/onnxruntime/test/ir/graph_test.cc b/onnxruntime/test/ir/graph_test.cc
index 7ad8a66049..287d6d9723 100644
--- a/onnxruntime/test/ir/graph_test.cc
+++ b/onnxruntime/test/ir/graph_test.cc
@@ -3,6 +3,7 @@
 
 #include <iostream>
 #include "core/common/inlined_containers.h"
+#include "core/common/span_utils.h"
 #include "core/framework/tensorprotoutils.h"
 #include "core/graph/graph_viewer.h"
 #include "core/graph/model.h"
@@ -258,8 +259,8 @@ static void ValidateSparseTensorProto(const SparseTensorProto& proto) {
   // Can't use ContainerEq on float
   EXPECT_EQ(actual_values.size(), sparse_details::values.size());
   // std::equal() with a predicate is only in C++20
-  auto actual_begin = actual_values.cbegin();
-  const auto actual_end = actual_values.cend();
+  auto actual_begin = actual_values.begin();
+  const auto actual_end = actual_values.end();
   auto expected_begin = sparse_details::values.cbegin();
   while (actual_begin != actual_end) {
     auto diff = *actual_begin - *expected_begin;
@@ -1542,7 +1543,7 @@ void AddAttribute(onnxruntime::Node& p_node, const std::string& attr_name, int64
   p_node.AddAttribute(attr_name, attr_value);
 }
 
-void AddAttribute(onnxruntime::Node& p_node, const std::string& attr_name, std::initializer_list<int64_t> attr_value) {
+void AddAttribute(onnxruntime::Node& p_node, const std::string& attr_name, gsl::span<const int64_t> attr_value) {
   p_node.AddAttribute(attr_name, attr_value);
 }
 
@@ -1556,7 +1557,7 @@ TEST_F(GraphTest, TypeAttribute) {
   outputs.push_back(&output_arg);
   auto& node_1 = graph.AddNode("node_1", "RandomNormal", "node 1.", inputs, outputs);
   AddAttribute(node_1, "dtype", TensorProto_DataType_FLOAT);
-  AddAttribute(node_1, "shape", {2, 3});
+  AddAttribute(node_1, "shape", AsSpan<int64_t>({2, 3}));
   auto status = graph.Resolve();
   EXPECT_TRUE(status.IsOK()) << status.ErrorMessage();
 }
diff --git a/onnxruntime/test/onnx/tensorprotoutils.cc b/onnxruntime/test/onnx/tensorprotoutils.cc
index 902ee79d1b..2f9ba70d31 100644
--- a/onnxruntime/test/onnx/tensorprotoutils.cc
+++ b/onnxruntime/test/onnx/tensorprotoutils.cc
@@ -6,7 +6,6 @@
 #include <memory>
 #include <algorithm>
 #include <limits>
-#include <gsl/gsl>
 
 #include "mem_buffer.h"
 #include "core/common/safeint.h"
diff --git a/onnxruntime/test/optimizer/avx2_weight_s8_to_u8_test.cc b/onnxruntime/test/optimizer/avx2_weight_s8_to_u8_test.cc
index b25475e46c..1d125724e2 100644
--- a/onnxruntime/test/optimizer/avx2_weight_s8_to_u8_test.cc
+++ b/onnxruntime/test/optimizer/avx2_weight_s8_to_u8_test.cc
@@ -1,9 +1,13 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
 #include "core/optimizer/qdq_transformer/avx2_weight_s8_to_u8.h"
 
 #include <functional>
 #include <string>
 #include <vector>
 
+#include "core/common/span_utils.h"
 #include "core/graph/model.h"
 #include "core/session/inference_session.h"
 #include "core/util/math_cpuonly.h"
@@ -91,14 +95,14 @@ TEST(CPU_U8S8_Precision_Tests, MatMulIntegerToFloat) {
   std::vector<uint8_t> B_data = random.Uniform<uint8_t>(B_dims, 240, 255);
 
   std::vector<float> A_scale = random.Uniform<float>(
-      std::array<int64_t, 1>{1}, -0.1f, 0.1f);
+      AsSpan<int64_t>({1}), -0.1f, 0.1f);
   std::vector<uint8_t> A_zero_point{245};
 
-  std::vector<float> B_scale = random.Uniform<float>({B_dims.back()}, -0.1f, 0.1f);
+  std::vector<float> B_scale = random.Uniform<float>(AsSpan({B_dims.back()}), -0.1f, 0.1f);
 
-  std::vector<uint8_t> B_zero_point = random.Uniform<uint8_t>(B_dims.back(), 240, 250);
+  std::vector<uint8_t> B_zero_point = random.Uniform<uint8_t>(AsSpan({B_dims.back()}), 240, 250);
 
-  std::vector<float> Bias = random.Uniform<float>({B_dims.back()}, -0.1f, 0.1f);
+  std::vector<float> Bias = random.Uniform<float>(AsSpan({B_dims.back()}), -0.1f, 0.1f);
 
   std::vector<OrtValue> baseline_fetches;
 
@@ -204,11 +208,11 @@ TEST(CPU_U8S8_Precision_Tests, DynamicQuantizeMatMul) {
 
   std::vector<uint8_t> B_data = random.Uniform<uint8_t>(B_dims, 240, 255);
 
-  std::vector<float> B_scale = random.Uniform<float>({B_dims.back()}, -0.1f, 0.1f);
+  std::vector<float> B_scale = random.Uniform<float>(AsSpan({B_dims.back()}), -0.1f, 0.1f);
 
-  std::vector<uint8_t> B_zero_point = random.Uniform<uint8_t>({B_dims.back()}, 240, 250);
+  std::vector<uint8_t> B_zero_point = random.Uniform<uint8_t>(AsSpan({B_dims.back()}), 240, 250);
 
-  std::vector<float> Bias = random.Uniform<float>({B_dims.back()}, -0.1f, 0.1f);
+  std::vector<float> Bias = random.Uniform<float>(AsSpan({B_dims.back()}), -0.1f, 0.1f);
 
   std::vector<OrtValue> baseline_fetches;
 
diff --git a/onnxruntime/test/optimizer/initializer_test.cc b/onnxruntime/test/optimizer/initializer_test.cc
index 19c71d4afa..e73950bbd1 100644
--- a/onnxruntime/test/optimizer/initializer_test.cc
+++ b/onnxruntime/test/optimizer/initializer_test.cc
@@ -8,7 +8,7 @@
 #include <numeric>
 #include <type_traits>
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 #include "gtest/gtest.h"
 
@@ -80,7 +80,7 @@ TEST(OptimizerInitializerTest, LoadExternalData) {
 
         if (offset + length <= tensor_data_span.size()) {
           Initializer i(tensor_proto, tensor_data_dir_path);
-          EXPECT_EQ(gsl::make_span(i.data<int32_t>(), i.size()), tensor_data_span.subspan(offset, length));
+          EXPECT_EQ(i.DataAsSpan<int32_t>(), tensor_data_span.subspan(offset, length));
         } else {
           EXPECT_THROW(Initializer i(tensor_proto, tensor_data_dir_path), OnnxRuntimeException);
         }
diff --git a/onnxruntime/test/platform/file_io_test.cc b/onnxruntime/test/platform/file_io_test.cc
index 9dec13767e..1e5a516af0 100644
--- a/onnxruntime/test/platform/file_io_test.cc
+++ b/onnxruntime/test/platform/file_io_test.cc
@@ -14,10 +14,11 @@
 #include <Windows.h>
 #endif
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 #include "gtest/gtest.h"
 
+#include "core/common/span_utils.h"
 #include "test/util/include/file_util.h"
 
 namespace onnxruntime {
@@ -102,7 +103,7 @@ TEST(FileIoTest, ReadFileIntoBuffer) {
 
     auto expected_data_span = gsl::make_span(expected_data.data() + offset, length);
 
-    ASSERT_EQ(buffer_span, expected_data_span);
+    ASSERT_TRUE(SpanEq(buffer_span, expected_data_span));
   }
 
   // invalid - negative offset
@@ -140,7 +141,7 @@ TEST(FileIoTest, MapFileIntoMemory) {
 
     auto expected_data_span = gsl::make_span(expected_data.data() + offset, length);
 
-    ASSERT_EQ(mapped_span, expected_data_span);
+    ASSERT_TRUE(SpanEq(mapped_span, expected_data_span));
   }
 
   {
@@ -185,7 +186,7 @@ TEST(FileIoTest, MapFileIntoMemory) {
 
     auto expected_data_span = gsl::make_span(expected_data.data() + offset, length);
 
-    ASSERT_EQ(mapped_span, expected_data_span);
+    ASSERT_TRUE(SpanEq(mapped_span, expected_data_span));
   }
 
   {
diff --git a/onnxruntime/test/providers/cpu/controlflow/scan_test.cc b/onnxruntime/test/providers/cpu/controlflow/scan_test.cc
index 7c68772bd8..8f2fb2ddaa 100644
--- a/onnxruntime/test/providers/cpu/controlflow/scan_test.cc
+++ b/onnxruntime/test/providers/cpu/controlflow/scan_test.cc
@@ -399,7 +399,7 @@ static void RunTest_v9(const std::string test_name, int64_t sequence_len, int64_
     }
 
     const auto output_dims = output_shape.GetDims();
-    return std::vector<int64_t>(output_dims.cbegin(), output_dims.cend());
+    return std::vector<int64_t>(output_dims.begin(), output_dims.end());
   };
 
   test.AddOutput<float>("scan_output_0", calculate_output_shape(0), output_0);
diff --git a/onnxruntime/test/providers/cpu/ml/array_feature_extractor_test.cc b/onnxruntime/test/providers/cpu/ml/array_feature_extractor_test.cc
index 4d77f9675c..d320b36b74 100644
--- a/onnxruntime/test/providers/cpu/ml/array_feature_extractor_test.cc
+++ b/onnxruntime/test/providers/cpu/ml/array_feature_extractor_test.cc
@@ -5,7 +5,7 @@
 
 #include "gtest/gtest.h"
 #include "test/providers/provider_test_utils.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 using namespace std;
 namespace onnxruntime {
 namespace test {
diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc
index 042334758d..bf6fe4cd3c 100644
--- a/onnxruntime/test/providers/cpu/model_tests.cc
+++ b/onnxruntime/test/providers/cpu/model_tests.cc
@@ -6,7 +6,6 @@
 
 #include "core/session/onnxruntime_c_api.h"
 #include "core/session/onnxruntime_cxx_api.h"
-#include "core/common/gsl_suppress.h"
 #include "core/session/ort_apis.h"
 #include "core/session/inference_session.h"
 #include "core/session/ort_env.h"
diff --git a/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc b/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc
index af88c44d8c..b68fd85b43 100644
--- a/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc
@@ -5,7 +5,7 @@
 
 #include "boost/mp11.hpp"
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 #include "gtest/gtest.h"
 
diff --git a/onnxruntime/test/providers/cpu/tensor/where_op_test.cc b/onnxruntime/test/providers/cpu/tensor/where_op_test.cc
index 8f0525a70f..43a0d8b4c2 100644
--- a/onnxruntime/test/providers/cpu/tensor/where_op_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/where_op_test.cc
@@ -3,7 +3,7 @@
 
 #include "gtest/gtest.h"
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 #include "test/providers/provider_test_utils.h"
 
diff --git a/onnxruntime/test/providers/internal_testing/internal_testing_tests.cc b/onnxruntime/test/providers/internal_testing/internal_testing_tests.cc
index 5727c44df3..d864eb55d5 100644
--- a/onnxruntime/test/providers/internal_testing/internal_testing_tests.cc
+++ b/onnxruntime/test/providers/internal_testing/internal_testing_tests.cc
@@ -4,6 +4,7 @@
 #if !defined(REDUCED_OPS_BUILD)  // may not work with excluded op kernel implementations
 
 #include "core/common/logging/logging.h"
+#include "core/common/span_utils.h"
 #include "core/framework/utils.h"
 #include "core/session/inference_session.h"
 #include "core/session/onnxruntime_cxx_api.h"
@@ -89,7 +90,7 @@ static void ExecuteMnist(InferenceSessionWrapper& session, bool custom_ep_enable
     const auto& initializer = session_state.GetConstantInitializedTensors().at(idx);
     const auto expected = initializer.Get<Tensor>().DataAsSpan<float>();
 
-    ASSERT_THAT(data, ::testing::ContainerEq(expected));
+    ASSERT_TRUE(SpanEq(data, expected));
   }
 }
 
diff --git a/onnxruntime/test/providers/provider_test_utils.cc b/onnxruntime/test/providers/provider_test_utils.cc
index 9a9e0218f4..54250ab78c 100644
--- a/onnxruntime/test/providers/provider_test_utils.cc
+++ b/onnxruntime/test/providers/provider_test_utils.cc
@@ -658,7 +658,7 @@ void OpTester::AddSparseCooTensorData(std::vector<Data>& data,
   auto p_tensor = MakeSparseTensor(data_type, dims);
   auto mutator = p_tensor->MakeCooData(nnz, indices.size());
   CopyDataToTensor(values, mutator.Values());
-  CopyDataToTensor(indices.as_bytes(), mutator.Indices());
+  CopyDataToTensor(gsl::as_bytes(indices), mutator.Indices());
 
   NodeArg node_arg = MakeSparseNodeArg(dtype, name, dims, dim_params);
   AddSparseTensorData(data, std::move(node_arg), std::move(p_tensor), check_params);
@@ -680,8 +680,8 @@ void OpTester::AddSparseCooTensorStrings(std::vector<Data>& data,
   auto mutator = p_tensor->MakeCooData(nnz, indices.size());
   auto mutable_values = mutator.Values().MutableDataAsSpan<std::string>();
   ORT_ENFORCE(values.size() == mutable_values.size(), "Must allocate space for values");
-  std::copy(values.cbegin(), values.cend(), mutable_values.begin());
-  CopyDataToTensor(indices.as_bytes(), mutator.Indices());
+  std::copy(values.begin(), values.end(), mutable_values.begin());
+  CopyDataToTensor(gsl::as_bytes(indices), mutator.Indices());
   NodeArg node_arg = MakeSparseNodeArg(dtype, name, dims, dim_params);
   AddSparseTensorData(data, std::move(node_arg), std::move(p_tensor), CheckParams());
 }
@@ -704,8 +704,8 @@ void OpTester::AddSparseCsrTensorData(std::vector<Data>& data,
 
   auto mutator = p_tensor->MakeCsrData(nnz, inner_indices.size(), outer_indices.size());
   CopyDataToTensor(values, mutator.Values());
-  CopyDataToTensor(inner_indices.as_bytes(), mutator.Inner());
-  CopyDataToTensor(outer_indices.as_bytes(), mutator.Outer());
+  CopyDataToTensor(gsl::as_bytes(inner_indices), mutator.Inner());
+  CopyDataToTensor(gsl::as_bytes(outer_indices), mutator.Outer());
 
   NodeArg node_arg = MakeSparseNodeArg(dtype, name, dims, dim_params);
   AddSparseTensorData(data, std::move(node_arg), std::move(p_tensor), check_params);
@@ -729,9 +729,9 @@ void OpTester::AddSparseCsrTensorStrings(std::vector<Data>& data,
   auto mutator = p_tensor->MakeCsrData(nnz, inner_indices.size(), outer_indices.size());
   auto mutable_values = mutator.Values().MutableDataAsSpan<std::string>();
   ORT_ENFORCE(values.size() == mutable_values.size(), "Must allocate space for values");
-  std::copy(values.cbegin(), values.cend(), mutable_values.begin());
-  CopyDataToTensor(inner_indices.as_bytes(), mutator.Inner());
-  CopyDataToTensor(outer_indices.as_bytes(), mutator.Outer());
+  std::copy(values.begin(), values.end(), mutable_values.begin());
+  CopyDataToTensor(gsl::as_bytes(inner_indices), mutator.Inner());
+  CopyDataToTensor(gsl::as_bytes(outer_indices), mutator.Outer());
   NodeArg node_arg = MakeSparseNodeArg(dtype, name, dims, dim_params);
   AddSparseTensorData(data, std::move(node_arg), std::move(p_tensor), CheckParams());
 }
diff --git a/onnxruntime/test/providers/provider_test_utils.h b/onnxruntime/test/providers/provider_test_utils.h
index 2a3eb621aa..6e7ad7e8cd 100644
--- a/onnxruntime/test/providers/provider_test_utils.h
+++ b/onnxruntime/test/providers/provider_test_utils.h
@@ -8,7 +8,7 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include <gsl/gsl>
+#include "core/common/gsl.h"
 
 #include "core/common/logging/logging.h"
 #include "core/common/optional.h"
@@ -288,7 +288,7 @@ class OpTester {
                          const std::vector<std::string>* dim_params = nullptr) {
     auto ml_type = DataTypeImpl::GetType<T>();
     AddSparseCooTensorData(input_data_, ml_type, name, dims,
-                           gsl::make_span(values).as_bytes(),
+                           gsl::as_bytes(gsl::make_span(values)),
                            gsl::make_span(indices),
                            CheckParams(), dim_params);
   }
@@ -338,7 +338,7 @@ class OpTester {
                          const std::vector<std::string>* dim_params = nullptr) {
     auto ml_type = DataTypeImpl::GetType<T>();
     AddSparseCsrTensorData(input_data_, ml_type, name, dims,
-                           gsl::make_span(values).as_bytes(),
+                           gsl::as_bytes(gsl::make_span(values)),
                            gsl::make_span(inner_indices),
                            gsl::make_span(outer_indices),
                            CheckParams(), dim_params);
@@ -1218,7 +1218,7 @@ inline std::vector<int64_t> GetShapeVector(const TensorShape& shape) {
   std::vector<int64_t> result;
   const auto dims = shape.GetDims();
   result.resize(dims.size());
-  result.assign(dims.cbegin(), dims.cend());
+  result.assign(dims.begin(), dims.end());
   return result;
 }
 
diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc
index b2fb1479b6..f692dc8833 100644
--- a/onnxruntime/test/shared_lib/test_inference.cc
+++ b/onnxruntime/test/shared_lib/test_inference.cc
@@ -26,7 +26,7 @@
 #include "test_fixture.h"
 #include "utils.h"
 #include "custom_op_utils.h"
-#include <gsl/gsl>
+#include "core/common/gsl.h"
 
 #ifdef _WIN32
 #include <Windows.h>
@@ -281,7 +281,7 @@ TEST(CApiTest, SparseOutputModel) {
 
   const auto* values_fetch = sparse_output.GetSparseTensorValues<float>();
   auto val_span = gsl::make_span(values_fetch, values.size());
-  ASSERT_TRUE(std::equal(values.cbegin(), values.cend(), val_span.cbegin(), val_span.cend()));
+  ASSERT_TRUE(std::equal(values.cbegin(), values.cend(), val_span.begin(), val_span.end()));
 
   auto indices_ts = sparse_output.GetSparseTensorIndicesTypeShapeInfo(ORT_SPARSE_COO_INDICES);
   ASSERT_EQ(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, indices_ts.GetElementType());
@@ -291,7 +291,7 @@ TEST(CApiTest, SparseOutputModel) {
   const int64_t* indices = sparse_output.GetSparseTensorIndicesData<int64_t>(ORT_SPARSE_COO_INDICES, num_indices);
   ASSERT_EQ(num_indices, static_cast<size_t>(indices_shape[0]));
   auto ind_span = gsl::make_span(indices, num_indices);
-  ASSERT_TRUE(std::equal(coo_indices.cbegin(), coo_indices.cend(), ind_span.cbegin(), ind_span.cend()));
+  ASSERT_TRUE(std::equal(coo_indices.cbegin(), coo_indices.cend(), ind_span.begin(), ind_span.end()));
 }
 
 #ifndef DISABLE_CONTRIB_OPS
@@ -363,7 +363,7 @@ TEST(CApiTest, SparseInputModel) {
 
   const auto* result_vals = dense_Y.GetTensorData<float>();
   auto result_span = gsl::make_span(result_vals, Y_result.size());
-  ASSERT_TRUE(std::equal(Y_result.cbegin(), Y_result.cend(), result_span.cbegin(), result_span.cend()));
+  ASSERT_TRUE(std::equal(Y_result.cbegin(), Y_result.cend(), result_span.begin(), result_span.end()));
 }
 #endif  // DISABLE_CONTRIB_OPS
 #endif  // !defined(DISABLE_SPARSE_TENSORS)
diff --git a/onnxruntime/test/shared_lib/test_nontensor_types.cc b/onnxruntime/test/shared_lib/test_nontensor_types.cc
index bd81a83077..ede3f49c14 100644
--- a/onnxruntime/test/shared_lib/test_nontensor_types.cc
+++ b/onnxruntime/test/shared_lib/test_nontensor_types.cc
@@ -9,7 +9,7 @@
 #include "core/session/onnxruntime_cxx_api.h"
 #include "test_allocator.h"
 
-#include <gsl/gsl>
+#include "core/common/gsl.h"
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
@@ -61,18 +61,16 @@ TEST(CApiTest, CreateGetVectorOfMapsInt64Float) {  // support zipmap output type
   size_t num_values = seq_ort.GetCount();
   ASSERT_EQ(num_values, N);
 
+#if !defined(ORT_NO_EXCEPTIONS)
   // test negative case
   bool failed = false;
-  ORT_TRY {
+  try {
     auto temp = seq_ort.GetValue(999, default_allocator.get());
+  } catch (const Ort::Exception& e) {
+    failed = e.GetOrtErrorCode() == ORT_RUNTIME_EXCEPTION;
   }
-  ORT_CATCH(const Ort::Exception& e) {
-    ORT_HANDLE_EXCEPTION([&]() {
-      failed = e.GetOrtErrorCode() == ORT_RUNTIME_EXCEPTION;
-    });
-  }
-
   ASSERT_EQ(failed, true);
+#endif  // !defined(ORT_NO_EXCEPTIONS)
 
   // Fetch
   for (size_t idx = 0; idx < N; ++idx) {
@@ -354,7 +352,7 @@ TEST(CApiTest, SparseTensorUsingAPI) {
     {
       const auto* values = coo_st.GetSparseTensorValues<int32_t>();
       auto val_span = gsl::make_span(values, values_shape[0]);
-      ASSERT_TRUE(std::equal(expected_values.cbegin(), expected_values.cend(), val_span.cbegin(), val_span.cend()));
+      ASSERT_TRUE(std::equal(expected_values.cbegin(), expected_values.cend(), val_span.begin(), val_span.end()));
     }
 
     {
@@ -366,7 +364,7 @@ TEST(CApiTest, SparseTensorUsingAPI) {
       const int64_t* indices = coo_st.GetSparseTensorIndicesData<int64_t>(ORT_SPARSE_COO_INDICES, num_indices);
       ASSERT_EQ(num_indices, static_cast<size_t>(indices_shape[0]));
       auto ind_span = gsl::make_span(indices, num_indices);
-      ASSERT_TRUE(std::equal(expected_linear_indices.cbegin(), expected_linear_indices.cend(), ind_span.cbegin(), ind_span.cend()));
+      ASSERT_TRUE(std::equal(expected_linear_indices.cbegin(), expected_linear_indices.cend(), ind_span.begin(), ind_span.end()));
     }
   }
 
@@ -413,7 +411,7 @@ TEST(CApiTest, SparseTensorUsingAPI) {
     {
       const auto* values = csr_st.GetSparseTensorValues<int32_t>();
       auto val_span = gsl::make_span(values, expected_values.size());
-      ASSERT_TRUE(std::equal(expected_values.cbegin(), expected_values.cend(), val_span.cbegin(), val_span.cend()));
+      ASSERT_TRUE(std::equal(expected_values.cbegin(), expected_values.cend(), val_span.begin(), val_span.end()));
     }
 
     {
@@ -425,7 +423,7 @@ TEST(CApiTest, SparseTensorUsingAPI) {
       const int64_t* indices = csr_st.GetSparseTensorIndicesData<int64_t>(ORT_SPARSE_CSR_INNER_INDICES, num_indices);
       ASSERT_EQ(num_indices, expected_inner.size());
       auto ind_span = gsl::make_span(indices, num_indices);
-      ASSERT_TRUE(std::equal(expected_inner.cbegin(), expected_inner.cend(), ind_span.cbegin(), ind_span.cend()));
+      ASSERT_TRUE(std::equal(expected_inner.cbegin(), expected_inner.cend(), ind_span.begin(), ind_span.end()));
     }
 
     {
@@ -437,7 +435,7 @@ TEST(CApiTest, SparseTensorUsingAPI) {
       const int64_t* indices = csr_st.GetSparseTensorIndicesData<int64_t>(ORT_SPARSE_CSR_OUTER_INDICES, num_indices);
       ASSERT_EQ(num_indices, expected_outer.size());
       auto ind_span = gsl::make_span(indices, num_indices);
-      ASSERT_TRUE(std::equal(expected_outer.cbegin(), expected_outer.cend(), ind_span.cbegin(), ind_span.cend()));
+      ASSERT_TRUE(std::equal(expected_outer.cbegin(), expected_outer.cend(), ind_span.begin(), ind_span.end()));
     }
   }
 
@@ -481,7 +479,7 @@ TEST(CApiTest, SparseTensorUsingAPI) {
     {
       const auto* values = bsp_st.GetSparseTensorValues<int32_t>();
       auto val_span = gsl::make_span(values, data_blocks.size());
-      ASSERT_TRUE(std::equal(data_blocks.cbegin(), data_blocks.cend(), val_span.cbegin(), val_span.cend()));
+      ASSERT_TRUE(std::equal(data_blocks.cbegin(), data_blocks.cend(), val_span.begin(), val_span.end()));
     }
     {
       auto indices_ts = bsp_st.GetSparseTensorIndicesTypeShapeInfo(ORT_SPARSE_BLOCK_SPARSE_INDICES);
@@ -492,7 +490,7 @@ TEST(CApiTest, SparseTensorUsingAPI) {
       const int32_t* indices = bsp_st.GetSparseTensorIndicesData<int32_t>(ORT_SPARSE_BLOCK_SPARSE_INDICES, num_indices);
       ASSERT_EQ(num_indices, blocksparse_indices.size());
       auto ind_span = gsl::make_span(indices, num_indices);
-      ASSERT_TRUE(std::equal(blocksparse_indices.cbegin(), blocksparse_indices.cend(), ind_span.cbegin(), ind_span.cend()));
+      ASSERT_TRUE(std::equal(blocksparse_indices.cbegin(), blocksparse_indices.cend(), ind_span.begin(), ind_span.end()));
     }
   }
 }
@@ -540,7 +538,7 @@ TEST(CApiTest, SparseTensorFillSparseTensorFormatAPI) {
     {
       const auto* values = coo_st.GetSparseTensorValues<int32_t>();
       auto val_span = gsl::make_span(values, values_shape[0]);
-      ASSERT_TRUE(std::equal(expected_values.cbegin(), expected_values.cend(), val_span.cbegin(), val_span.cend()));
+      ASSERT_TRUE(std::equal(expected_values.cbegin(), expected_values.cend(), val_span.begin(), val_span.end()));
     }
 
     {
@@ -552,7 +550,7 @@ TEST(CApiTest, SparseTensorFillSparseTensorFormatAPI) {
       const int64_t* indices = coo_st.GetSparseTensorIndicesData<int64_t>(ORT_SPARSE_COO_INDICES, num_indices);
       ASSERT_EQ(num_indices, static_cast<size_t>(indices_shape[0]));
       auto ind_span = gsl::make_span(indices, num_indices);
-      ASSERT_TRUE(std::equal(expected_linear_indices.cbegin(), expected_linear_indices.cend(), ind_span.cbegin(), ind_span.cend()));
+      ASSERT_TRUE(std::equal(expected_linear_indices.cbegin(), expected_linear_indices.cend(), ind_span.begin(), ind_span.end()));
     }
   }
   {
@@ -597,7 +595,7 @@ TEST(CApiTest, SparseTensorFillSparseTensorFormatAPI) {
     {
       const auto* values = csr_st.GetSparseTensorValues<int32_t>();
       auto val_span = gsl::make_span(values, expected_values.size());
-      ASSERT_TRUE(std::equal(expected_values.cbegin(), expected_values.cend(), val_span.cbegin(), val_span.cend()));
+      ASSERT_TRUE(std::equal(expected_values.cbegin(), expected_values.cend(), val_span.begin(), val_span.end()));
     }
 
     {
@@ -609,7 +607,7 @@ TEST(CApiTest, SparseTensorFillSparseTensorFormatAPI) {
       const int64_t* indices = csr_st.GetSparseTensorIndicesData<int64_t>(ORT_SPARSE_CSR_INNER_INDICES, num_indices);
       ASSERT_EQ(num_indices, expected_inner.size());
       auto ind_span = gsl::make_span(indices, num_indices);
-      ASSERT_TRUE(std::equal(expected_inner.cbegin(), expected_inner.cend(), ind_span.cbegin(), ind_span.cend()));
+      ASSERT_TRUE(std::equal(expected_inner.cbegin(), expected_inner.cend(), ind_span.begin(), ind_span.end()));
     }
 
     {
@@ -621,7 +619,7 @@ TEST(CApiTest, SparseTensorFillSparseTensorFormatAPI) {
       const int64_t* indices = csr_st.GetSparseTensorIndicesData<int64_t>(ORT_SPARSE_CSR_OUTER_INDICES, num_indices);
       ASSERT_EQ(num_indices, expected_outer.size());
       auto ind_span = gsl::make_span(indices, num_indices);
-      ASSERT_TRUE(std::equal(expected_outer.cbegin(), expected_outer.cend(), ind_span.cbegin(), ind_span.cend()));
+      ASSERT_TRUE(std::equal(expected_outer.cbegin(), expected_outer.cend(), ind_span.begin(), ind_span.end()));
     }
   }
   {
@@ -664,7 +662,7 @@ TEST(CApiTest, SparseTensorFillSparseTensorFormatAPI) {
     {
       const auto* values = bsp_st.GetSparseTensorValues<int32_t>();
       auto val_span = gsl::make_span(values, data_blocks.size());
-      ASSERT_TRUE(std::equal(data_blocks.cbegin(), data_blocks.cend(), val_span.cbegin(), val_span.cend()));
+      ASSERT_TRUE(std::equal(data_blocks.cbegin(), data_blocks.cend(), val_span.begin(), val_span.end()));
     }
     {
       auto indices_ts = bsp_st.GetSparseTensorIndicesTypeShapeInfo(ORT_SPARSE_BLOCK_SPARSE_INDICES);
@@ -675,7 +673,7 @@ TEST(CApiTest, SparseTensorFillSparseTensorFormatAPI) {
       const int32_t* indices = bsp_st.GetSparseTensorIndicesData<int32_t>(ORT_SPARSE_BLOCK_SPARSE_INDICES, num_indices);
       ASSERT_EQ(num_indices, blocksparse_indices.size());
       auto ind_span = gsl::make_span(indices, num_indices);
-      ASSERT_TRUE(std::equal(blocksparse_indices.cbegin(), blocksparse_indices.cend(), ind_span.cbegin(), ind_span.cend()));
+      ASSERT_TRUE(std::equal(blocksparse_indices.cbegin(), blocksparse_indices.cend(), ind_span.begin(), ind_span.end()));
     }
   }
 }
@@ -756,7 +754,7 @@ TEST(CApiTest, SparseTensorFillSparseFormatStringsAPI) {
       const int64_t* indices = coo_st.GetSparseTensorIndicesData<int64_t>(ORT_SPARSE_COO_INDICES, num_indices);
       ASSERT_EQ(num_indices, static_cast<size_t>(indices_shape[0]));
       auto ind_span = gsl::make_span(indices, num_indices);
-      ASSERT_TRUE(std::equal(expected_linear_indices.cbegin(), expected_linear_indices.cend(), ind_span.cbegin(), ind_span.cend()));
+      ASSERT_TRUE(std::equal(expected_linear_indices.cbegin(), expected_linear_indices.cend(), ind_span.begin(), ind_span.end()));
     }
   }
   {
@@ -832,7 +830,7 @@ TEST(CApiTest, SparseTensorFillSparseFormatStringsAPI) {
       const int64_t* indices = csr_st.GetSparseTensorIndicesData<int64_t>(ORT_SPARSE_CSR_INNER_INDICES, num_indices);
       ASSERT_EQ(num_indices, expected_inner.size());
       auto ind_span = gsl::make_span(indices, num_indices);
-      ASSERT_TRUE(std::equal(expected_inner.cbegin(), expected_inner.cend(), ind_span.cbegin(), ind_span.cend()));
+      ASSERT_TRUE(std::equal(expected_inner.cbegin(), expected_inner.cend(), ind_span.begin(), ind_span.end()));
     }
 
     {
@@ -844,7 +842,7 @@ TEST(CApiTest, SparseTensorFillSparseFormatStringsAPI) {
       const int64_t* indices = csr_st.GetSparseTensorIndicesData<int64_t>(ORT_SPARSE_CSR_OUTER_INDICES, num_indices);
       ASSERT_EQ(num_indices, expected_outer.size());
       auto ind_span = gsl::make_span(indices, num_indices);
-      ASSERT_TRUE(std::equal(expected_outer.cbegin(), expected_outer.cend(), ind_span.cbegin(), ind_span.cend()));
+      ASSERT_TRUE(std::equal(expected_outer.cbegin(), expected_outer.cend(), ind_span.begin(), ind_span.end()));
     }
   }
   {
@@ -919,8 +917,8 @@ TEST(CApiTest, SparseTensorFillSparseFormatStringsAPI) {
       const int32_t* indices = bsp_st.GetSparseTensorIndicesData<int32_t>(ORT_SPARSE_BLOCK_SPARSE_INDICES, num_indices);
       ASSERT_EQ(num_indices, blocksparse_indices.size());
       auto ind_span = gsl::make_span(indices, num_indices);
-      ASSERT_TRUE(std::equal(blocksparse_indices.cbegin(), blocksparse_indices.cend(), ind_span.cbegin(), ind_span.cend()));
+      ASSERT_TRUE(std::equal(blocksparse_indices.cbegin(), blocksparse_indices.cend(), ind_span.begin(), ind_span.end()));
     }
   }
 }
-#endif  // !defined(DISABLE_SPARSE_TENSORS)
\ No newline at end of file
+#endif  // !defined(DISABLE_SPARSE_TENSORS)
diff --git a/onnxruntime/test/testdata/custom_execution_provider_library/my_ep_factory.cc b/onnxruntime/test/testdata/custom_execution_provider_library/my_ep_factory.cc
index 02f381e0db..42b9e8b48b 100644
--- a/onnxruntime/test/testdata/custom_execution_provider_library/my_ep_factory.cc
+++ b/onnxruntime/test/testdata/custom_execution_provider_library/my_ep_factory.cc
@@ -1,8 +1,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
-#include "core/common/gsl_suppress.h"
 #include "my_ep_factory.h"
 #include "my_execution_provider.h"
+#include "core/common/gsl.h"
 #include "core/providers/shared/common.h"
 #include <iostream>
 #include "core/framework/provider_options_utils.h"
@@ -90,4 +90,4 @@ ORT_API(size_t, ProviderHashFunc, const void* provider_options){
   return info.device_id;
 }
 
-}
\ No newline at end of file
+}
diff --git a/onnxruntime/test/util/test_utils.cc b/onnxruntime/test/util/test_utils.cc
index 31401aeeb9..b2c32b486a 100644
--- a/onnxruntime/test/util/test_utils.cc
+++ b/onnxruntime/test/util/test_utils.cc
@@ -3,6 +3,8 @@
 
 #include "test/util/include/test_utils.h"
 
+#include "core/common/narrow.h"
+#include "core/common/span_utils.h"
 #include "core/framework/ort_value.h"
 #include "core/graph/onnx_protobuf.h"
 #include "core/session/inference_session.h"
@@ -25,19 +27,19 @@ static void VerifyOutputs(const std::vector<std::string>& output_names,
   for (size_t i = 0, end = expected_fetches.size(); i < end; ++i) {
     auto& ltensor = expected_fetches[i].Get<Tensor>();
     auto& rtensor = fetches[i].Get<Tensor>();
-    ASSERT_EQ(ltensor.Shape().GetDims(), rtensor.Shape().GetDims());
+    ASSERT_TRUE(SpanEq(ltensor.Shape().GetDims(), rtensor.Shape().GetDims()));
     auto element_type = ltensor.GetElementType();
     switch (element_type) {
       case ONNX_NAMESPACE::TensorProto_DataType_INT32:
-        EXPECT_THAT(ltensor.DataAsSpan<int32_t>(), ::testing::ContainerEq(rtensor.DataAsSpan<int32_t>()))
+        EXPECT_TRUE(SpanEq(ltensor.DataAsSpan<int32_t>(), rtensor.DataAsSpan<int32_t>()))
             << " mismatch for " << output_names[i];
         break;
       case ONNX_NAMESPACE::TensorProto_DataType_INT64:
-        EXPECT_THAT(ltensor.DataAsSpan<int64_t>(), ::testing::ContainerEq(rtensor.DataAsSpan<int64_t>()))
+        EXPECT_TRUE(SpanEq(ltensor.DataAsSpan<int64_t>(), rtensor.DataAsSpan<int64_t>()))
             << " mismatch for " << output_names[i];
         break;
       case ONNX_NAMESPACE::TensorProto_DataType_UINT8:
-        EXPECT_THAT(ltensor.DataAsSpan<uint8_t>(), ::testing::ContainerEq(rtensor.DataAsSpan<uint8_t>()))
+        EXPECT_TRUE(SpanEq(ltensor.DataAsSpan<uint8_t>(), rtensor.DataAsSpan<uint8_t>()))
             << " mismatch for " << output_names[i];
         break;
       case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: {
@@ -171,7 +173,7 @@ void SparseIndicesChecker(const ONNX_NAMESPACE::TensorProto& indices_proto, gsl:
   gsl::span<const int64_t> ind_span;
   std::vector<int64_t> converted_indices;
   TensorShape ind_shape(indices_proto.dims().data(), indices_proto.dims().size());
-  const auto elements = gsl::narrow<size_t>(ind_shape.Size());
+  const auto elements = narrow<size_t>(ind_shape.Size());
   const bool has_raw_data = indices_proto.has_raw_data();
   switch (indices_proto.data_type()) {
     case ONNX_NAMESPACE::TensorProto_DataType_INT64: {
@@ -179,7 +181,7 @@ void SparseIndicesChecker(const ONNX_NAMESPACE::TensorProto& indices_proto, gsl:
         const auto& rd = indices_proto.raw_data();
         ASSERT_EQ(rd.size(), elements * sizeof(int64_t));
         ASSERT_STATUS_OK(utils::UnpackInitializerData(indices_proto, model_path, unpack_buffer));
-        ind_span = gsl::make_span(unpack_buffer).as_span<const int64_t>();
+        ind_span = ReinterpretAsSpan<const int64_t>(gsl::make_span(unpack_buffer));
       } else {
         ind_span = gsl::make_span(indices_proto.int64_data().cbegin(), indices_proto.int64_data().cend());
       }
@@ -190,8 +192,8 @@ void SparseIndicesChecker(const ONNX_NAMESPACE::TensorProto& indices_proto, gsl:
         const auto& rd = indices_proto.raw_data();
         ASSERT_EQ(rd.size(), elements * sizeof(int32_t));
         ASSERT_STATUS_OK(utils::UnpackInitializerData(indices_proto, model_path, unpack_buffer));
-        auto int32_span = gsl::make_span(unpack_buffer).as_span<const int32_t>();
-        converted_indices.insert(converted_indices.cend(), int32_span.cbegin(), int32_span.cend());
+        auto int32_span = ReinterpretAsSpan<const int32_t>(gsl::make_span(unpack_buffer));
+        converted_indices.insert(converted_indices.cend(), int32_span.begin(), int32_span.end());
       } else {
         converted_indices.insert(converted_indices.cend(), indices_proto.int32_data().cbegin(), indices_proto.int32_data().cend());
       }
@@ -203,8 +205,8 @@ void SparseIndicesChecker(const ONNX_NAMESPACE::TensorProto& indices_proto, gsl:
       const auto& rd = indices_proto.raw_data();
       ASSERT_EQ(rd.size(), elements * sizeof(int16_t));
       ASSERT_STATUS_OK(utils::UnpackInitializerData(indices_proto, model_path, unpack_buffer));
-      auto int16_span = gsl::make_span(unpack_buffer).as_span<const int16_t>();
-      converted_indices.insert(converted_indices.cend(), int16_span.cbegin(), int16_span.cend());
+      auto int16_span = ReinterpretAsSpan<const int16_t>(gsl::make_span(unpack_buffer));
+      converted_indices.insert(converted_indices.cend(), int16_span.begin(), int16_span.end());
       ind_span = gsl::make_span(converted_indices);
       break;
     }
@@ -213,15 +215,15 @@ void SparseIndicesChecker(const ONNX_NAMESPACE::TensorProto& indices_proto, gsl:
       const auto& rd = indices_proto.raw_data();
       ASSERT_EQ(rd.size(), elements);
       ASSERT_STATUS_OK(utils::UnpackInitializerData(indices_proto, model_path, unpack_buffer));
-      auto int8_span = gsl::make_span(unpack_buffer).as_span<const int8_t>();
-      converted_indices.insert(converted_indices.cend(), int8_span.cbegin(), int8_span.cend());
+      auto int8_span = ReinterpretAsSpan<const int8_t>(gsl::make_span(unpack_buffer));
+      converted_indices.insert(converted_indices.cend(), int8_span.begin(), int8_span.end());
       ind_span = gsl::make_span(converted_indices);
       break;
     }
     default:
       ASSERT_TRUE(false);
   }
-  ASSERT_THAT(ind_span, testing::ContainerEq(expected_indicies));
+  ASSERT_TRUE(SpanEq(ind_span, expected_indicies));
 }
 
 #endif  // DISABLE_SPARSE_TENSORS
diff --git a/orttraining/orttraining/core/framework/pipeline.h b/orttraining/orttraining/core/framework/pipeline.h
index 58b7f58338..a93ba1081d 100644
--- a/orttraining/orttraining/core/framework/pipeline.h
+++ b/orttraining/orttraining/core/framework/pipeline.h
@@ -12,7 +12,6 @@
 #include <thread>
 #include <unordered_map>
 
-#include "gsl/gsl"
 #include "orttraining/core/framework/distributed_run_context.h"
 #include "core/framework/ort_value.h"
 
@@ -361,4 +360,4 @@ struct PipelineContext {
 
 }  // namespace pipeline
 }  // namespace training
-}  // namespace onnxruntime
\ No newline at end of file
+}  // namespace onnxruntime
diff --git a/orttraining/orttraining/core/graph/optimizer_builder.h b/orttraining/orttraining/core/graph/optimizer_builder.h
index 18d2b774d5..2597589c2f 100644
--- a/orttraining/orttraining/core/graph/optimizer_builder.h
+++ b/orttraining/orttraining/core/graph/optimizer_builder.h
@@ -19,7 +19,7 @@ const std::string LAMB_STEP_TENSOR_NAME = "Step";
 const std::string ADAM_UC_PREFIX = "Update_Count";
 
 namespace training_internal {
-constexpr int64_t single_span_element = 1;
+constexpr auto single_element_dims = std::array{int64_t{1}};
 }
 
 template <class T>
@@ -34,12 +34,12 @@ template <class T>
 inline ONNX_NAMESPACE::TensorProto CreateTensorProto(
     const std::string& name,
     const std::vector<T>& values,
-    gsl::span<const int64_t> dims = gsl::span<const int64_t>(training_internal::single_span_element)) {
-  const size_t count = static_cast<size_t>(std::accumulate(dims.cbegin(), dims.cend(), int64_t(1), std::multiplies<int64_t>{}));
+    gsl::span<const int64_t> dims = training_internal::single_element_dims) {
+  const size_t count = static_cast<size_t>(std::accumulate(dims.begin(), dims.end(), int64_t(1), std::multiplies<int64_t>{}));
   ORT_ENFORCE(values.size() == count);
   ONNX_NAMESPACE::TensorProto tensor_proto = ONNX_NAMESPACE::ToTensor<T>(values);
   tensor_proto.set_name(name);
-  std::for_each(dims.cbegin(), dims.cend(), [&](auto dim) { tensor_proto.add_dims(dim); });
+  std::for_each(dims.begin(), dims.end(), [&](auto dim) { tensor_proto.add_dims(dim); });
   return tensor_proto;
 }
 
@@ -47,10 +47,10 @@ template <class T>
 inline ONNX_NAMESPACE::TensorProto CreateTensorProto(
     const std::string& name,
     gsl::span<const T> values_span,
-    gsl::span<const int64_t> dims = gsl::span<const int64_t>(training_internal::single_span_element)) {
+    gsl::span<const int64_t> dims = training_internal::single_element_dims) {
   std::vector<T> values;
   values.reserve(values_span.size());
-  values.assign(values_span.cbegin(), values_span.cend());
+  values.assign(values_span.begin(), values_span.end());
   return CreateTensorProto(name, values, dims);
 }
 
@@ -59,12 +59,12 @@ template <class T>
 inline ONNX_NAMESPACE::TensorProto CreateTensorProto(
     const std::string& name,
     T val,
-    gsl::span<const int64_t> dims = gsl::span<const int64_t>(training_internal::single_span_element)) {
-  size_t count = static_cast<size_t>(std::accumulate(dims.cbegin(), dims.cend(), int64_t(1), std::multiplies<int64_t>{}));
+    gsl::span<const int64_t> dims = training_internal::single_element_dims) {
+  size_t count = static_cast<size_t>(std::accumulate(dims.begin(), dims.end(), int64_t(1), std::multiplies<int64_t>{}));
   std::vector<T> values(count, val);
   ONNX_NAMESPACE::TensorProto tensor_proto = ONNX_NAMESPACE::ToTensor<T>(values);
   tensor_proto.set_name(name);
-  std::for_each(dims.cbegin(), dims.cend(), [&](auto dim) { tensor_proto.add_dims(dim); });
+  std::for_each(dims.begin(), dims.end(), [&](auto dim) { tensor_proto.add_dims(dim); });
   return tensor_proto;
 }
 
diff --git a/orttraining/orttraining/core/graph/zero_optimizer_graph_builder.cc b/orttraining/orttraining/core/graph/zero_optimizer_graph_builder.cc
index c44b593d78..5b8e089dda 100644
--- a/orttraining/orttraining/core/graph/zero_optimizer_graph_builder.cc
+++ b/orttraining/orttraining/core/graph/zero_optimizer_graph_builder.cc
@@ -4,6 +4,7 @@
 #include "orttraining/core/graph/zero_optimizer_graph_builder.h"
 
 #include "core/common/common.h"
+#include "core/common/span_utils.h"
 #include "core/framework/tensorprotoutils.h"
 #include "core/graph/graph.h"
 #include "core/graph/graph_utils.h"
@@ -141,12 +142,12 @@ static std::vector<ArgDef> AddPartitionsForParameter(
         //add the modified weight name to get state
         updated_weight_names_map[initializer_name] = partition_name;
 
-        auto partition_argdef = ArgDef(partition_name, graph_defs.CreateTypeProto({partition_size}, dtype));
+        auto partition_argdef = ArgDef(partition_name, graph_defs.CreateTypeProto(AsSpan({partition_size}), dtype));
 
         view_outputs.push_back(partition_argdef);
       } else {
         auto dtype = ONNX_NAMESPACE::TensorProto_DataType_FLOAT;
-        auto partition_argdef = ArgDef(partition_name, graph_defs.CreateTypeProto(std::array<const int64_t, 1>{shapes[i].Size()}, dtype));
+        auto partition_argdef = ArgDef(partition_name, graph_defs.CreateTypeProto(AsSpan({shapes[i].Size()}), dtype));
         view_outputs.push_back(partition_argdef);
       }
       view_num++;
@@ -167,8 +168,8 @@ static std::vector<ArgDef> AddViewForParameter(
       const int64_t dims = shape.NumDimensions();
 
       ArgDef shape_argdef(argdef.name + "_view_shape_" + std::to_string(view_num),
-                          graph_defs.CreateTypeProto({dims}, ONNX_NAMESPACE::TensorProto_DataType_INT64));
-      graph_defs.AddInitializers({CreateTensorProto<int64_t>(shape_argdef.name, shape.AsShapeVector(), {dims})});
+                          graph_defs.CreateTypeProto(AsSpan({dims}), ONNX_NAMESPACE::TensorProto_DataType_INT64));
+      graph_defs.AddInitializers({CreateTensorProto<int64_t>(shape_argdef.name, shape.AsShapeVector(), AsSpan({dims}))});
 
       auto dtype = static_cast<ONNX_NAMESPACE::TensorProto_DataType>(argdef.type_proto->tensor_type().elem_type());
       ArgDef view_argdef(GetViewName(argdef.name, view_num),
diff --git a/orttraining/orttraining/core/optimizer/graph_transformer_utils.h b/orttraining/orttraining/core/optimizer/graph_transformer_utils.h
index d0bba49746..011a007ab6 100644
--- a/orttraining/orttraining/core/optimizer/graph_transformer_utils.h
+++ b/orttraining/orttraining/core/optimizer/graph_transformer_utils.h
@@ -3,7 +3,7 @@
 
 #pragma once
 
-#include <gsl/gsl>
+#include "core/common/gsl.h"
 
 #include "core/optimizer/graph_transformer.h"
 #include "orttraining/core/optimizer/graph_transformer_config.h"
@@ -24,7 +24,7 @@ std::vector<std::unique_ptr<GraphTransformer>> GeneratePreTrainingTransformers(
     const std::unordered_set<std::string>& rules_and_transformers_to_disable = {});
 
 /** Generates all predefined (both rule-based and non-rule-based) transformers for this level.
-    If transformers_and_rules_to_enable is not empty, it returns the intersection between the predefined transformers/rules 
+    If transformers_and_rules_to_enable is not empty, it returns the intersection between the predefined transformers/rules
     and the transformers_and_rules_to_enable. */
 InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
     TransformerLevel level,
diff --git a/orttraining/orttraining/core/session/tensor_helper.cc b/orttraining/orttraining/core/session/tensor_helper.cc
index 0734c55c7c..3e6df9a1c8 100644
--- a/orttraining/orttraining/core/session/tensor_helper.cc
+++ b/orttraining/orttraining/core/session/tensor_helper.cc
@@ -18,15 +18,15 @@ TensorShapeVector GetSliceShape(
   ORT_ENFORCE(shape.size() > 0);
   ORT_ENFORCE(slice_axis < shape.size());
   ORT_ENFORCE(num_slices > 0);
-  ORT_ENFORCE(shape.at(slice_axis) > 0);
-  ORT_ENFORCE(shape.at(slice_axis) % num_slices == 0);
+  ORT_ENFORCE(shape[slice_axis] > 0);
+  ORT_ENFORCE(shape[slice_axis] % num_slices == 0);
 
   // Shape of slice along slice_axis.
   TensorShapeVector slice_shape(shape.size());
   // Compute original slice's shape.
   std::copy(shape.begin(), shape.end(), slice_shape.begin());
   // Replace the sliced dimension.
-  slice_shape.at(slice_axis) = shape.at(slice_axis) / num_slices;
+  slice_shape[slice_axis] = shape[slice_axis] / num_slices;
 
   return slice_shape;
 }
diff --git a/orttraining/orttraining/eager/ort_aten.cpp b/orttraining/orttraining/eager/ort_aten.cpp
index c25a574f57..2247078ab6 100644
--- a/orttraining/orttraining/eager/ort_aten.cpp
+++ b/orttraining/orttraining/eager/ort_aten.cpp
@@ -603,7 +603,8 @@ at::IntArrayRef BroadcastShape(
   auto status = ComputeOutputShape(node_name, ort_tensor_lhs.Shape(), ort_tensor_rhs.Shape(), out_shape);
   CHECK_STATUS(status);
   auto out_shape_dims = out_shape.GetDims();
-  return at::IntArrayRef(&out_shape_dims[0], out_shape_dims.size());
+  return !out_shape_dims.empty() ? at::IntArrayRef(out_shape_dims.data(), out_shape_dims.size())
+                                 : at::IntArrayRef();
 }
 
 // #pragma region Hand-Implemented ATen Ops
diff --git a/orttraining/orttraining/test/graph/optimizer_graph_builder_test.cc b/orttraining/orttraining/test/graph/optimizer_graph_builder_test.cc
index 751d056f7e..5485ab2c72 100644
--- a/orttraining/orttraining/test/graph/optimizer_graph_builder_test.cc
+++ b/orttraining/orttraining/test/graph/optimizer_graph_builder_test.cc
@@ -11,6 +11,7 @@
 #include "gtest/gtest.h"
 
 #include "core/common/common.h"
+#include "core/common/span_utils.h"
 #include "core/graph/graph.h"
 #include "core/graph/model.h"
 #include "orttraining/core/graph/gradient_builder_base.h"
@@ -248,7 +249,7 @@ TEST_F(OptimizerGraphBuilderTest, ZeroSplitInitialOptimizerState) {
   for (const auto& state : initial_states) {
     const auto& init_tensor = state.second.Get<Tensor>();
     const auto& shape = init_tensor.Shape().GetDims();
-    ASSERT_EQ(shape, gsl::make_span(expected_shape));
+    ASSERT_TRUE(SpanEq(shape, gsl::make_span(expected_shape)));
     const std::vector<float> found(init_tensor.Data<float>(),
                                    init_tensor.Data<float>() + partition_size);
     ASSERT_EQ(expected_vec, found);
diff --git a/orttraining/orttraining/test/session/training_session_test_utils.cc b/orttraining/orttraining/test/session/training_session_test_utils.cc
index 7677e19f60..2f5a60a9fa 100644
--- a/orttraining/orttraining/test/session/training_session_test_utils.cc
+++ b/orttraining/orttraining/test/session/training_session_test_utils.cc
@@ -2,6 +2,7 @@
 // Licensed under the MIT License.
 
 #include "orttraining/test/session/training_session_test_utils.h"
+#include "core/common/span_utils.h"
 #include "orttraining/core/graph/optimizer_builder.h"
 #include "test/util/include/default_providers.h"
 
@@ -115,8 +116,7 @@ void VerifyState(const DataTransferManager& data_transfer_mgr, const NameMLValMa
       // compare "Update_Count" or "Step"
       ASSERT_EQ(actual_tensor.GetElementType(), ONNX_NAMESPACE::TensorProto_DataType_INT64);
       ASSERT_EQ(expected_tensor.Shape(), actual_tensor.Shape());
-      std::array<int64_t, 1> dims = {1};
-      ASSERT_EQ(expected_tensor.Shape().GetDims(), gsl::make_span(dims));
+      ASSERT_TRUE(SpanEq(expected_tensor.Shape().GetDims(), AsSpan<int64_t>({1})));
       auto size = expected_tensor.Shape().Size();
       const std::vector<int64_t> expected(expected_tensor.template Data<int64_t>(), expected_tensor.template Data<int64_t>() + size);
       const std::vector<int64_t> actual(actual_tensor.template Data<int64_t>(), actual_tensor.template Data<int64_t>() + size);
diff --git a/orttraining/orttraining/test/training_api/common/synthetic_data_loader.h b/orttraining/orttraining/test/training_api/common/synthetic_data_loader.h
index ea3399ccc0..542acb2e33 100644
--- a/orttraining/orttraining/test/training_api/common/synthetic_data_loader.h
+++ b/orttraining/orttraining/test/training_api/common/synthetic_data_loader.h
@@ -11,7 +11,7 @@
 
 #pragma once
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 #include <onnxruntime_cxx_api.h>
 
diff --git a/orttraining/orttraining/training_ops/cpu/activation/activations_grad.cc b/orttraining/orttraining/training_ops/cpu/activation/activations_grad.cc
index 55cee5a00f..b7958961b7 100644
--- a/orttraining/orttraining/training_ops/cpu/activation/activations_grad.cc
+++ b/orttraining/orttraining/training_ops/cpu/activation/activations_grad.cc
@@ -3,7 +3,7 @@
 
 #include "orttraining/training_ops/cpu/activation/activations_grad.h"
 
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 #if defined(_MSC_VER)
 #pragma warning(push)
diff --git a/orttraining/orttraining/training_ops/cpu/loss/cross_entropy.cc b/orttraining/orttraining/training_ops/cpu/loss/cross_entropy.cc
index d331ca1aea..e2cdf238a0 100644
--- a/orttraining/orttraining/training_ops/cpu/loss/cross_entropy.cc
+++ b/orttraining/orttraining/training_ops/cpu/loss/cross_entropy.cc
@@ -8,7 +8,7 @@
 #include <unsupported/Eigen/SpecialFunctions>
 #include "core/util/math.h"
 #include "core/providers/cpu/math/matmul_helper.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 namespace onnxruntime {
 namespace contrib {
diff --git a/orttraining/orttraining/training_ops/cpu/loss/softmax_cross_entropy_loss.cc b/orttraining/orttraining/training_ops/cpu/loss/softmax_cross_entropy_loss.cc
index 7b80ea651a..165910d39c 100644
--- a/orttraining/orttraining/training_ops/cpu/loss/softmax_cross_entropy_loss.cc
+++ b/orttraining/orttraining/training_ops/cpu/loss/softmax_cross_entropy_loss.cc
@@ -11,7 +11,7 @@
 #include "core/providers/cpu/controlflow/scan_utils.h"
 #include "orttraining/training_ops/cpu/loss/cross_entropy.h"
 #include "orttraining/training_ops/cpu/loss/softmax_cross_entropy_loss.h"
-#include "gsl/gsl"
+#include "core/common/gsl.h"
 
 namespace onnxruntime {
 namespace contrib {
diff --git a/orttraining/orttraining/training_ops/cpu/op_gradients.cc b/orttraining/orttraining/training_ops/cpu/op_gradients.cc
index e1b02cb7e3..93200f50e7 100644
--- a/orttraining/orttraining/training_ops/cpu/op_gradients.cc
+++ b/orttraining/orttraining/training_ops/cpu/op_gradients.cc
@@ -3,6 +3,7 @@
 
 #include "orttraining/training_ops/cpu/op_gradients.h"
 
+#include "core/common/gsl.h"
 #include "core/mlas/inc/mlas.h"
 #include "core/providers/common.h"
 #include "core/providers/cpu/math/element_wise_ops.h"
@@ -11,7 +12,6 @@
 #include "core/util/math.h"
 #include "core/util/math_cpuonly.h"
 #include <unsupported/Eigen/SpecialFunctions>
-#include "gsl/gsl"
 
 namespace onnxruntime {
 namespace contrib {
diff --git a/orttraining/orttraining/training_ops/cpu/tensor/split.cc b/orttraining/orttraining/training_ops/cpu/tensor/split.cc
index 5f326cbf8e..1f88de6fed 100644
--- a/orttraining/orttraining/training_ops/cpu/tensor/split.cc
+++ b/orttraining/orttraining/training_ops/cpu/tensor/split.cc
@@ -2,12 +2,13 @@
 // Licensed under the MIT License.
 
 #include "orttraining/training_ops/cpu/tensor/split.h"
+
+#include "core/common/gsl.h"
+#include "core/common/narrow.h"
 #include "core/providers/common.h"
 #include "core/util/math.h"
 #include "core/util/math_cpuonly.h"
 
-#include "gsl/gsl"
-
 namespace onnxruntime {
 namespace contrib {
 
@@ -29,11 +30,11 @@ Status PrepareForTrainingCompute(const TensorShape& input_shape, int num_outputs
   axis = HandleNegativeAxis(axis_value, num_dimensions);  // handle negative and enforce axis is valid
   const int64_t split_dim_size = input_dims[axis];
 
-  before_dims = gsl::narrow<int>(input_shape.SizeToDimension(axis));
-  after_dims_including_split_axis = gsl::narrow<int>(input_shape.SizeFromDimension(axis));
+  before_dims = narrow<int>(input_shape.SizeToDimension(axis));
+  after_dims_including_split_axis = narrow<int>(input_shape.SizeFromDimension(axis));
   after_dims_excluding_split = (axis + 1 == num_dimensions)
                                    ? 1  // we multiply by this value so must be 1 not 0
-                                   : gsl::narrow<int>(input_shape.SizeFromDimension(axis + 1));
+                                   : narrow<int>(input_shape.SizeFromDimension(axis + 1));
 
   std::vector<int64_t> split_sizes_values(split_sizes);
   split_sizes.clear();
@@ -125,7 +126,7 @@ Status SplitTraining::ComputeImpl(OpKernelContext& context, const Tensor& input)
 
   for (int i = 0; i < num_outputs; ++i) {
     // update size of dimension for axis we're splitting on
-    auto split_size = gsl::narrow<int>(split_sizes[i]);
+    auto split_size = narrow<int>(split_sizes[i]);
     output_dimensions[axis] = split_size;
 
     Tensor* output = context.Output(i, TensorShape{output_dimensions});
diff --git a/orttraining/orttraining/training_ops/cuda/nn/batch_norm_grad.h b/orttraining/orttraining/training_ops/cuda/nn/batch_norm_grad.h
index c462829f2b..24f13aab81 100644
--- a/orttraining/orttraining/training_ops/cuda/nn/batch_norm_grad.h
+++ b/orttraining/orttraining/training_ops/cuda/nn/batch_norm_grad.h
@@ -3,8 +3,6 @@
 
 #pragma once
 
-#include "gsl/gsl"
-
 #include "core/providers/cuda/cuda_kernel.h"
 #include "core/providers/cuda/cudnn_common.h"
 
diff --git a/orttraining/orttraining/training_ops/cuda/nn/batch_norm_internal.h b/orttraining/orttraining/training_ops/cuda/nn/batch_norm_internal.h
index 3f46c91f22..27a8393dc0 100644
--- a/orttraining/orttraining/training_ops/cuda/nn/batch_norm_internal.h
+++ b/orttraining/orttraining/training_ops/cuda/nn/batch_norm_internal.h
@@ -3,7 +3,6 @@
 
 #pragma once
 
-#include "gsl/gsl"
 #include "core/providers/cuda/cuda_kernel.h"
 #include "core/providers/cuda/cudnn_common.h"
 
diff --git a/orttraining/orttraining/training_ops/rocm/nn/batch_norm_grad.h b/orttraining/orttraining/training_ops/rocm/nn/batch_norm_grad.h
index c94f73881b..63d2370076 100644
--- a/orttraining/orttraining/training_ops/rocm/nn/batch_norm_grad.h
+++ b/orttraining/orttraining/training_ops/rocm/nn/batch_norm_grad.h
@@ -3,8 +3,6 @@
 
 #pragma once
 
-#include "gsl/gsl"
-
 #include "core/providers/rocm/rocm_kernel.h"
 #include "core/providers/rocm/miopen_common.h"
 
diff --git a/orttraining/orttraining/training_ops/rocm/nn/batch_norm_internal.h b/orttraining/orttraining/training_ops/rocm/nn/batch_norm_internal.h
index cb2817951e..d65b66120a 100644
--- a/orttraining/orttraining/training_ops/rocm/nn/batch_norm_internal.h
+++ b/orttraining/orttraining/training_ops/rocm/nn/batch_norm_internal.h
@@ -3,7 +3,6 @@
 
 #pragma once
 
-#include "gsl/gsl"
 #include "core/providers/rocm/rocm_kernel.h"
 #include "core/providers/rocm/miopen_common.h"