From 3367ddc5ba1f71f8416720aebb302bba19e75b79 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <yuslepukhin@users.noreply.github.com>
Date: Thu, 27 Jan 2022 08:32:05 -0800
Subject: [PATCH] Add abseil cgmanifest declaration. Update coding standards.
 (#10374)

Add abseil cgmanifest declaration. Update coding standards for InlinedContainers
  Adjust coding guidelines. Add default N calculation for InlinedVector<T, N> for general use.
  Rename T from InlinedShapeVectorT. Fix Eager build
  Add LLVM Copyright with modified derived code notice.
---
 cgmanifests/cgmanifest.json                   |  9 ++
 docs/Coding_Conventions_and_Standards.md      |  9 ++
 .../onnxruntime/core/framework/tensor_shape.h |  3 +-
 .../core/framework/inlined_containers.h       | 94 +++++++++++++++++--
 .../core/providers/cpu/controlflow/scan_9.cc  |  4 +-
 .../providers/cpu/controlflow/scan_utils.cc   |  4 +-
 .../providers/cpu/controlflow/scan_utils.h    |  4 +-
 .../einsum_typed_compute_processor.cc         | 12 +--
 .../providers/cpu/reduction/reduction_ops.cc  |  2 +-
 .../core/providers/cpu/tensor/transpose.cc    |  6 +-
 .../core/providers/cpu/tensor/transpose.h     |  6 +-
 .../core/providers/cuda/cudnn_common.cc       |  2 +-
 onnxruntime/core/providers/cuda/nn/pool.cc    |  6 +-
 .../core/providers/cuda/tensor/transpose.cc   |  6 +-
 .../core/providers/rocm/miopen_common.cc      |  6 +-
 onnxruntime/core/providers/rocm/nn/conv.cc    |  6 +-
 .../providers/rocm/reduction/reduction_ops.cc |  2 +-
 onnxruntime/test/framework/data_types_test.cc | 23 +++++
 .../providers/cpu/controlflow/scan_test.cc    |  2 +-
 19 files changed, 162 insertions(+), 44 deletions(-)
diff --git a/cgmanifests/cgmanifest.json b/cgmanifests/cgmanifest.json
index 479516e0e4..4cc609d272 100644
--- a/cgmanifests/cgmanifest.json
+++ b/cgmanifests/cgmanifest.json
@@ -1,5 +1,14 @@
 {
    "Registrations": [
+      {
+         "component": {
+            "type": "git",
+            "git": {
+               "commitHash": "9336be04a242237cd41a525bedfcf3be1bb55377",
+               "repositoryUrl": "https://github.com/abseil/abseil-cpp.git"
+            }
+         }
+      },   
       {
         "component": {
           "Type": "maven",
diff --git a/docs/Coding_Conventions_and_Standards.md b/docs/Coding_Conventions_and_Standards.md
index 26eec5de0f..b3331ed50d 100644
--- a/docs/Coding_Conventions_and_Standards.md
+++ b/docs/Coding_Conventions_and_Standards.md
@@ -23,6 +23,15 @@ Other
 * When adding a new class, disable copy/assignment/move until you have a proven need for these capabilities. If a need arises, enable copy/assignment/move selectively, and when doing so validate that the implementation of the class supports what is being enabled.
   * Use ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE initially
   * See the other ORT_DISALLOW_* macros in https://github.com/microsoft/onnxruntime/blob/master/include/onnxruntime/core/common/common.h
+* Consider using 'const gsl::span<const T>&' (or 'std::span' when supported) as input arguments when passing const references to containers with contiguous storage (like 'std::vector'). This makes the function container-independent and lets callers pass arbitrary memory spans or sub-spans as an argument.
+* The use of the following container typedefs to reduce memory allocations is preferred:
+  * Use the 'TensorShapeVector' typedef from core/framework/tensor_shape.h to build or modify shapes. It is based on a vector implementation that features small buffer optimization. Its small buffer size is the same as that of TensorShape. Use 'InlinedShapeVector<T>' for shape-related operations on a different element type.
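+  * Use the 'InlinedVector<T>' typedef instead of std::vector. By default, the number of inlined elements is chosen so that 'sizeof(InlinedVector<T>)' does not exceed 64 bytes (with at least one inlined element). You can customize the inlined size with the second, non-type template parameter N.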
+  * Use the 'InlinedHashSet<T>' and 'InlinedHashMap<K, V>' typedefs from core/framework/inlined_containers.h. These are drop-in replacements for 'std::unordered_set/map' that store their keys and values in one contiguous buffer and reduce the number of allocations. They also do not allocate an 'end' node. Note that these hash containers do not provide pointer stability.
+  * Consider using 'std::string_view' in maps and sets to reduce the number of allocations and avoid string duplication. Keep in mind that the strings referred to must stay alive for as long as the container uses them.
+  * We have chosen the Abseil library for the above typedefs. Abseil container documentation is [here](https://abseil.io/docs/cpp/guides/container#abseil-containers).
+* Prefer using `reserve()` rather than `resize()` on vectors. `resize()` default-constructs all the elements for the requested size, which can be expensive/noticeable even if the type is trivial. The default values are rarely used in practice, so that work is wasted. Construction like 'std::vector<int>(10, 0)' is equivalent to `resize()` and is potentially wasteful.
+* Use `reserve()` on hash containers or pass the number of items in the constructor.  
 * Don't use else after return. see: [https://llvm.org/docs/CodingStandards.html#don-t-use-else-after-a-return](https://llvm.org/docs/CodingStandards.html#don-t-use-else-after-a-return)
 * Don't overuse std::shared\_ptr. Use std::shared\_ptr only if it's not clear when and where the object will be deallocated. See also: [https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines#Rf-shared_ptr](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines#Rf-shared_ptr)
 * Avoid using the 'long' type, which could be either 32 bits or 64 bits.
diff --git a/include/onnxruntime/core/framework/tensor_shape.h b/include/onnxruntime/core/framework/tensor_shape.h
index 01de9e43ee..4db6b58adc 100644
--- a/include/onnxruntime/core/framework/tensor_shape.h
+++ b/include/onnxruntime/core/framework/tensor_shape.h
@@ -37,7 +37,8 @@ using TensorShapeVector = absl::InlinedVector<int64_t, kTensorShapeSmallBufferEl
 
 // Use this for inlined shape size where different types are needed.
 template <typename T>
-using InlinedShapeVectorT = absl::InlinedVector<T, kTensorShapeSmallBufferElementsSize>;
+using InlinedShapeVector = absl::InlinedVector<T, kTensorShapeSmallBufferElementsSize>;
+
 
 inline TensorShapeVector ToShapeVector(const gsl::span<const int64_t>& span) {
   TensorShapeVector out;
diff --git a/onnxruntime/core/framework/inlined_containers.h b/onnxruntime/core/framework/inlined_containers.h
index dfe35555e5..35d13e7495 100644
--- a/onnxruntime/core/framework/inlined_containers.h
+++ b/onnxruntime/core/framework/inlined_containers.h
@@ -1,6 +1,17 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+//===- llvm/ADT/SmallVector.h - 'Normally small' vectors --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// This file contains code and comments derived from llvm/ADT/SmallVector.h
+// 
+// Specifically CalculateInlinedVectorDefaultInlinedElements<T>() template is derived from 
+// CalculateSmallVectorDefaultInlinedElements<T>() and its comments.
+
 #pragma once
 
 #include <cmath>
@@ -20,21 +31,86 @@
 #endif
 
 namespace onnxruntime {
+/// Inspired by LLVM SmallVector with ONNX Runtime adjustments for abseil.
+///
+/// Helper class for calculating the default number of inline elements for
+/// `InlinedVector<T>`.
+/// This produces the following on MSVC x64 (element type -> inlined elements):
+///     int8_t  -> 41
+///     int16_t -> 21
+///     int32_t -> 11
+///     int64_t -> 6
+///     std::string (sizeof 40) -> 1
+template<typename T>
+struct CalculateInlinedVectorDefaultInlinedElements {
+  // Preferred upper bound, in bytes, on `sizeof(InlinedVector<T>)` when the
+  // default number of inlined elements is used.
+  //
+  // The default number of inlined elements ensures that
+  // 1. There is at least one inlined element.
+  // 2. `sizeof(InlinedVector<T>) <= kPreferredInlinedVectorSizeof` unless
+  // it contradicts 1.
+  static constexpr size_t kPreferredInlinedVectorSizeof = 64;
 
-// Use InlinedVector for small arrays that can fit on a stack.
+  // static_assert that sizeof(T) is not "too big".
+  //
+  // Because the InlinedVector must have at least one inlined element, it is possible
+  // for an arbitrarily large inlined element to allocate an arbitrarily large
+  // amount of inline storage. So we want to call attention to these cases and
+  // make sure that users are making an intentional decision if they request a lot of inline storage.
+  //
+  // We want this assertion to trigger in pathological cases, but otherwise
+  // not be too easy to hit. The cutoff used below is kPreferredInlinedVectorSizeof
+  // itself, applied to sizeof(absl::InlinedVector<T, 1>), i.e. one element plus header.
+  // NOTE(review): whether `InlinedVector<InlinedVector<T>>` stays within this cutoff
+  // depends on the platform-specific size of the inner vector - confirm per target.
+  //
+  // One wrinkle is that this assertion is in theory non-portable, since
+  // sizeof(absl::InlinedVector<T, 1>) is in general platform-dependent. However, we don't expect this
+  // to be much of an issue, because most development happens on 64-bit
+  // hosts, and therefore sizeof(T) is expected to *decrease* when compiled for
+  // 32-bit hosts, dodging the issue. The reverse situation, where development
+  // happens on a 32-bit host and then fails due to sizeof(T) *increasing* on a
+  // 64-bit host, is expected to be very rare.
+  static_assert(
+      sizeof(absl::InlinedVector<T, 1>) <= kPreferredInlinedVectorSizeof,
+      "You are trying to use a default number of inlined elements for "
+      "`InlinedVector<T>` but `sizeof(T)` is really big! Please use an "
+      "explicit number of inlined elements with `InlinedVector<T, N>` to make "
+      "sure you really want that much inline storage.");
+
+  // Discount the header, estimated as sizeof(absl::InlinedVector<T, 1>) - sizeof(T),
+  // when calculating the maximum inline bytes.
+  static constexpr size_t PreferredInlineBytes =
+      kPreferredInlinedVectorSizeof - (sizeof(absl::InlinedVector<T, 1>) - sizeof(T));
+  static constexpr size_t NumElementsThatFit = PreferredInlineBytes / sizeof(T);
+  static constexpr size_t value =
+      NumElementsThatFit == 0 ? 1 : NumElementsThatFit;
+};
+
+// Use InlinedVector for small arrays that can fit on the stack. When N is
+// omitted, a default number of inlined elements is pre-calculated for T.
 // Use TensorShapeVector for shapes.
-template <typename T, size_t N>
-using InlinedVector = absl::InlinedVector<T, N>;
+template <typename T, 
+          size_t N = CalculateInlinedVectorDefaultInlinedElements<T>::value,
+          typename Allocator = std::allocator<T>>
+using InlinedVector = absl::InlinedVector<T, N, Allocator>;
 
 // InlinedHashSet and InlinedHashMap are preferred
 // hash based containers. They store their values in the
 // buckets array that is allocated in one shot. It eliminates
-// per-node new/delete calls. Always call reserve() on any set/map
-// be it a std container or not.
-template <typename T>
-using InlinedHashSet = absl::flat_hash_set<T>;
+// per-node new/delete calls. Always call reserve() on any hash set/map
+// when the number of items is known in advance.
+template <typename T, 
+          typename Hash = absl::container_internal::hash_default_hash<T>,
+          typename Eq = absl::container_internal::hash_default_eq<T>,
+          typename Allocator = std::allocator<T>>
+using InlinedHashSet = absl::flat_hash_set<T, Hash, Eq, Allocator>;
 
-template <typename K, typename V>
-using InlinedHashMap = absl::flat_hash_map<K, V>;
+template <typename K, typename V,
+          typename Hash = absl::container_internal::hash_default_hash<K>,
+          typename Eq = absl::container_internal::hash_default_eq<K>,
+          typename Allocator = std::allocator<std::pair<const K, V>>>
+using InlinedHashMap = absl::flat_hash_map<K, V, Hash, Eq, Allocator>;
 
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cpu/controlflow/scan_9.cc b/onnxruntime/core/providers/cpu/controlflow/scan_9.cc
index 8bd03e6e47..28518f0aa9 100644
--- a/onnxruntime/core/providers/cpu/controlflow/scan_9.cc
+++ b/onnxruntime/core/providers/cpu/controlflow/scan_9.cc
@@ -345,7 +345,7 @@ Status ScanImpl::SetupInputs() {
       auto& input_tensor = *context_.Input<Tensor>(i + info_.num_loop_state_variables);
       const auto& input_shape = input_tensor.Shape();
 
-      InlinedShapeVectorT<size_t> permutations;
+      InlinedShapeVector<size_t> permutations;
       TensorShapeVector new_shape;
       CalculateTransposedShapeForInput(input_shape, sequence_dim, permutations, new_shape);
 
@@ -478,7 +478,7 @@ Status ScanImpl::TransposeOutput() {
         return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid value in scan_output_axes for output ", i,
                                " of ", axis, ". Output tensor rank was ", output_rank);
 
-      InlinedShapeVectorT<size_t> permutations;
+      InlinedShapeVector<size_t> permutations;
       TensorShapeVector new_shape;
       CalculateTransposedShapeForOutput(temporary_output_tensor.Shape(), axis, permutations, new_shape);
 
diff --git a/onnxruntime/core/providers/cpu/controlflow/scan_utils.cc b/onnxruntime/core/providers/cpu/controlflow/scan_utils.cc
index ea674953f4..0885f924cc 100644
--- a/onnxruntime/core/providers/cpu/controlflow/scan_utils.cc
+++ b/onnxruntime/core/providers/cpu/controlflow/scan_utils.cc
@@ -300,7 +300,7 @@ OrtValue AllocateTensorInMLValue(const MLDataType data_type, const TensorShape&
 };
 
 void CalculateTransposedShapeForInput(const TensorShape& original_shape, int64_t axis,
-                                      InlinedShapeVectorT<size_t>& permutations, TensorShapeVector& transposed_shape) {
+                                      InlinedShapeVector<size_t>& permutations, TensorShapeVector& transposed_shape) {
   int64_t rank = original_shape.NumDimensions();
   const auto& dims = original_shape.GetDims();
 
@@ -319,7 +319,7 @@ void CalculateTransposedShapeForInput(const TensorShape& original_shape, int64_t
 }
 
 void CalculateTransposedShapeForOutput(const TensorShape& original_shape, int64_t axis,
-                                       InlinedShapeVectorT<size_t>& permutations, TensorShapeVector& transposed_shape) {
+                                       InlinedShapeVector<size_t>& permutations, TensorShapeVector& transposed_shape) {
   int64_t rank = original_shape.NumDimensions();
   const auto& dims = original_shape.GetDims();
 
diff --git a/onnxruntime/core/providers/cpu/controlflow/scan_utils.h b/onnxruntime/core/providers/cpu/controlflow/scan_utils.h
index dbd11bd9b8..ede2d42a8c 100644
--- a/onnxruntime/core/providers/cpu/controlflow/scan_utils.h
+++ b/onnxruntime/core/providers/cpu/controlflow/scan_utils.h
@@ -196,7 +196,7 @@ e.g. if shape is {2, 3, 4} and axis 1 is chosen the permutations will be {1, 0,
      if axis 2 is chosen the permutations will be {2, 0, 1} and the output shape will be {4, 2, 3}
 */
 void CalculateTransposedShapeForInput(const TensorShape& original_shape, int64_t axis,
-                                      InlinedShapeVectorT<size_t>& permutations, TensorShapeVector& transposed_shape);
+                                      InlinedShapeVector<size_t>& permutations, TensorShapeVector& transposed_shape);
 
 /**
 Calculate the transpose permutations and shape by shifting the chosen axis FROM the first dimension.
@@ -205,7 +205,7 @@ e.g. if shape is {4, 2, 3} and axis 2 is chosen, dimension 0 will move to dimens
      the permutations will be {1, 2, 0} and output shape will be {2, 3, 4}
 */
 void CalculateTransposedShapeForOutput(const TensorShape& original_shape, int64_t axis,
-                                       InlinedShapeVectorT<size_t>& permutations, TensorShapeVector& transposed_shape);
+                                       InlinedShapeVector<size_t>& permutations, TensorShapeVector& transposed_shape);
 
 }  // namespace detail
 }  // namespace scan
diff --git a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_typed_compute_processor.cc b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_typed_compute_processor.cc
index 8645f19545..00b78cf17f 100644
--- a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_typed_compute_processor.cc
+++ b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_typed_compute_processor.cc
@@ -123,13 +123,13 @@ std::unique_ptr<Tensor> EinsumTypedComputeProcessor<T>::PairwiseOperandProcess(c
   // lro: dim indices that are present in left, right, and reduce_dims
   // lo: dim indices that are present in left and reduce_dims
   // ro: dim indices that are present in right and reduce_dims
-  InlinedShapeVectorT<size_t> lro;
+  InlinedShapeVector<size_t> lro;
   lro.reserve(kTensorShapeSmallBufferElementsSize);  // Reserve an arbitrary amount of space for this vector (not bound to see a tensor of rank > kTensorShapeSmallBufferElementsSize)
 
-  InlinedShapeVectorT<size_t> lo;
+  InlinedShapeVector<size_t> lo;
   lo.reserve(kTensorShapeSmallBufferElementsSize);  // Reserve an arbitrary amount of space for this vector (not bound to see a tensor of rank > kTensorShapeSmallBufferElementsSize)
 
-  InlinedShapeVectorT<size_t> ro;
+  InlinedShapeVector<size_t> ro;
   ro.reserve(kTensorShapeSmallBufferElementsSize);  // Reserve an arbitrary amount of space for this vector (not bound to see a tensor of rank > kTensorShapeSmallBufferElementsSize)
 
   // Maintain sizes to create reshaped "views"
@@ -193,7 +193,7 @@ std::unique_ptr<Tensor> EinsumTypedComputeProcessor<T>::PairwiseOperandProcess(c
 
   // Permutate the left operand so that the axes order go like this: [lro, lo, reduce_dims, ro]
   TensorShapeVector reshaped_dims;
-  InlinedShapeVectorT<size_t> left_permutation;
+  InlinedShapeVector<size_t> left_permutation;
   left_permutation.reserve(lro.size() + lo.size() + reduce_dims.size() + ro.size());
   left_permutation.insert(left_permutation.end(), lro.begin(), lro.end());
   left_permutation.insert(left_permutation.end(), lo.begin(), lo.end());
@@ -219,7 +219,7 @@ std::unique_ptr<Tensor> EinsumTypedComputeProcessor<T>::PairwiseOperandProcess(c
   }
 
   // Permutate the right operand so that the axes order go like this: [lro, reduce_dims, ro, lo]
-  InlinedShapeVectorT<size_t> right_permutation;
+  InlinedShapeVector<size_t> right_permutation;
   right_permutation.reserve(lro.size() + lo.size() + reduce_dims.size() + ro.size());
   right_permutation.insert(right_permutation.end(), lro.begin(), lro.end());
   right_permutation.insert(right_permutation.end(), reduce_dims.begin(), reduce_dims.end());
@@ -273,7 +273,7 @@ std::unique_ptr<Tensor> EinsumTypedComputeProcessor<T>::PairwiseOperandProcess(c
   // the output is permutated as well with respect to the original ordering of the axes.
   // The permutated order will be the dims in: [lro, lo, reduced_dims, ro]
   // Hence invert the permutation by a permutation that puts the axes in the same ordering
-  InlinedShapeVectorT<size_t> output_permutation;
+  InlinedShapeVector<size_t> output_permutation;
   if (!is_final_pair) {  // If this is not the final pair, we need to permutate the result to match the pre-fixed order for the next iteration
     output_permutation.resize(lro.size() + lo.size() + reduce_dims.size() + ro.size(), 0);
     size_t iter = 0;
diff --git a/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc b/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc
index 73c9edf7ab..97db23c50a 100644
--- a/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc
+++ b/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc
@@ -562,7 +562,7 @@ FastReduceKind OptimizeShapeForFastReduce(gsl::span<const int64_t> input_shape,
   fast_output_shape.clear();
   fast_output_shape.reserve(input_shape_size);
   bool empty_reduce = false;
-  InlinedShapeVectorT<bool> reduce(input_shape_size);
+  InlinedShapeVector<bool> reduce(input_shape_size);
   for (int64_t i = 0; i < input_shape_size; ++i) {
     reduce[i] = axes.find(i) != axes.end();
     if (reduce[i]) {
diff --git a/onnxruntime/core/providers/cpu/tensor/transpose.cc b/onnxruntime/core/providers/cpu/tensor/transpose.cc
index 437ba45682..dca8c381e3 100644
--- a/onnxruntime/core/providers/cpu/tensor/transpose.cc
+++ b/onnxruntime/core/providers/cpu/tensor/transpose.cc
@@ -247,7 +247,7 @@ static Status DoUntypedTranspose(const gsl::span<const size_t>& permutations, co
   const auto element_size = input.DataType()->Size();
   const bool is_string_type = input.IsDataTypeString();
 
-  InlinedShapeVectorT<size_t> stride(rank);
+  InlinedShapeVector<size_t> stride(rank);
   for (size_t i = 0; i < rank; i++) {
     size_t inpdim = permutations[i];
     if (inpdim + 1 < rank)
@@ -709,8 +709,8 @@ Status Transpose::Compute(OpKernelContext* ctx) const {
   size_t rank = input_dims.size();
 
   TensorShapeVector output_dims(rank);
-  const InlinedShapeVectorT<size_t>* p_perm;
-  InlinedShapeVectorT<size_t> default_perm(rank);
+  const InlinedShapeVector<size_t>* p_perm;
+  InlinedShapeVector<size_t> default_perm(rank);
   Status status = ComputeOutputShape(X, output_dims, default_perm, p_perm);
   if (!status.IsOK())
     return status;
diff --git a/onnxruntime/core/providers/cpu/tensor/transpose.h b/onnxruntime/core/providers/cpu/tensor/transpose.h
index 61cfe8280c..ef55972b38 100644
--- a/onnxruntime/core/providers/cpu/tensor/transpose.h
+++ b/onnxruntime/core/providers/cpu/tensor/transpose.h
@@ -58,8 +58,8 @@ class TransposeBase {
     }
   }
 
-  Status ComputeOutputShape(const Tensor& X, TensorShapeVector& output_dims, InlinedShapeVectorT<size_t>& default_perm,
-                            const InlinedShapeVectorT<size_t>*& p_perm) const {
+  Status ComputeOutputShape(const Tensor& X, TensorShapeVector& output_dims, InlinedShapeVector<size_t>& default_perm,
+                            const InlinedShapeVector<size_t>*& p_perm) const {
     size_t rank = X.Shape().NumDimensions();
     const auto& input_dims = X.Shape().GetDims();
 
@@ -93,7 +93,7 @@ class TransposeBase {
   }
 
   bool perm_specified_ = false;
-  InlinedShapeVectorT<size_t> perm_;
+  InlinedShapeVector<size_t> perm_;
 };
 
 class Transpose final : public OpKernel, public TransposeBase {
diff --git a/onnxruntime/core/providers/cuda/cudnn_common.cc b/onnxruntime/core/providers/cuda/cudnn_common.cc
index c789031057..ed1d792a52 100644
--- a/onnxruntime/core/providers/cuda/cudnn_common.cc
+++ b/onnxruntime/core/providers/cuda/cudnn_common.cc
@@ -100,7 +100,7 @@ Status CudnnFilterDescriptor::Set(gsl::span<const int64_t> filter_dims, cudnnDat
     CUDNN_RETURN_IF_ERROR(cudnnCreateFilterDescriptor(&desc_));
 
   int rank = gsl::narrow_cast<int>(filter_dims.size());
-  InlinedShapeVectorT<int> w_dims(rank);
+  InlinedShapeVector<int> w_dims(rank);
   for (int i = 0; i < rank; i++) {
     w_dims[i] = gsl::narrow_cast<int>(filter_dims[i]);
   }
diff --git a/onnxruntime/core/providers/cuda/nn/pool.cc b/onnxruntime/core/providers/cuda/nn/pool.cc
index 1670e30438..2d0e115f10 100644
--- a/onnxruntime/core/providers/cuda/nn/pool.cc
+++ b/onnxruntime/core/providers/cuda/nn/pool.cc
@@ -90,9 +90,9 @@ class CudnnPoolingDescriptor final {
       CUDNN_RETURN_IF_ERROR(cudnnCreatePoolingDescriptor(&desc_));
 
     int rank = gsl::narrow_cast<int>(kernel_shape.size());
-    InlinedShapeVectorT<int> window(rank);
-    InlinedShapeVectorT<int> padding(rank);
-    InlinedShapeVectorT<int> stride(rank);
+    InlinedShapeVector<int> window(rank);
+    InlinedShapeVector<int> padding(rank);
+    InlinedShapeVector<int> stride(rank);
     for (int i = 0; i < rank; i++) {
       window[i] = gsl::narrow_cast<int>(kernel_shape[i]);
     }
diff --git a/onnxruntime/core/providers/cuda/tensor/transpose.cc b/onnxruntime/core/providers/cuda/tensor/transpose.cc
index 71eae4cf61..803dfbf9a5 100644
--- a/onnxruntime/core/providers/cuda/tensor/transpose.cc
+++ b/onnxruntime/core/providers/cuda/tensor/transpose.cc
@@ -98,7 +98,7 @@ Status Transpose::DoTranspose(const cudaDeviceProp& prop,
   // flatten the adjacent dimensions which are contiguous
   // for example: permutations[0, 2, 3, 1] -> [0, 2, 1], permutations[0, 3, 1, 2] -> [0, 2, 1]
   auto new_rank = rank;
-  InlinedShapeVectorT<size_t> new_permutations(permutations.cbegin(), permutations.cend());
+  InlinedShapeVector<size_t> new_permutations(permutations.cbegin(), permutations.cend());
   TensorShapeVector new_input_dims = ToShapeVector(input_dims);
   TensorShapeVector new_output_dims = ToShapeVector(output_dims);
 
@@ -265,8 +265,8 @@ Status Transpose::ComputeInternal(OpKernelContext* ctx) const {
   int32_t rank = gsl::narrow_cast<int32_t>(input_shape.NumDimensions());
 
   TensorShapeVector output_dims(rank);
-  InlinedShapeVectorT<size_t> default_perm(rank);
-  const InlinedShapeVectorT<size_t>* p_perm = nullptr;
+  InlinedShapeVector<size_t> default_perm(rank);
+  const InlinedShapeVector<size_t>* p_perm = nullptr;
   const auto& status = ComputeOutputShape(X, output_dims, default_perm, p_perm);
   if (!status.IsOK())
     return status;
diff --git a/onnxruntime/core/providers/rocm/miopen_common.cc b/onnxruntime/core/providers/rocm/miopen_common.cc
index 7b44b6069c..3de6c408cb 100644
--- a/onnxruntime/core/providers/rocm/miopen_common.cc
+++ b/onnxruntime/core/providers/rocm/miopen_common.cc
@@ -31,8 +31,8 @@ Status MiopenTensor::Set(gsl::span<const int64_t> input_dims, miopenDataType_t d
 
   int rank = gsl::narrow_cast<int>(input_dims.size());
   TensorPitches pitches(input_dims);
-  InlinedShapeVectorT<int> dims(rank);
-  InlinedShapeVectorT<int> strides(rank);
+  InlinedShapeVector<int> dims(rank);
+  InlinedShapeVector<int> strides(rank);
   for (int i = 0; i < rank; i++) {
     dims[i] = gsl::narrow_cast<int>(input_dims[i]);
     strides[i] = gsl::narrow_cast<int>(pitches[i]);
@@ -63,7 +63,7 @@ Status MiopenTensorDescriptor::Set(gsl::span<const int64_t> filter_dims, miopenD
     MIOPEN_RETURN_IF_ERROR(miopenCreateTensorDescriptor(&desc_));
 
   int rank = gsl::narrow_cast<int>(filter_dims.size());
-  InlinedShapeVectorT<int> w_dims(rank);
+  InlinedShapeVector<int> w_dims(rank);
   for (int i = 0; i < rank; i++) {
     w_dims[i] = gsl::narrow_cast<int>(filter_dims[i]);
   }
diff --git a/onnxruntime/core/providers/rocm/nn/conv.cc b/onnxruntime/core/providers/rocm/nn/conv.cc
index 234040d11d..7f90a90545 100644
--- a/onnxruntime/core/providers/rocm/nn/conv.cc
+++ b/onnxruntime/core/providers/rocm/nn/conv.cc
@@ -333,9 +333,9 @@ Status MiopenConvolutionDescriptor::Set(
   if (!desc_)
     MIOPEN_RETURN_IF_ERROR(miopenCreateConvolutionDescriptor(&desc_));
 
-  InlinedShapeVectorT<int> pad_dims(rank);
-  InlinedShapeVectorT<int> stride_dims(rank);
-  InlinedShapeVectorT<int> dilation_dims(rank);
+  InlinedShapeVector<int> pad_dims(rank);
+  InlinedShapeVector<int> stride_dims(rank);
+  InlinedShapeVector<int> dilation_dims(rank);
   for (size_t i = 0; i < rank; i++) {
     pad_dims[i] = gsl::narrow_cast<int>(pads[i]);
     stride_dims[i] = gsl::narrow_cast<int>(strides[i]);
diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc
index 35f17076dd..9258beb423 100644
--- a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc
+++ b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc
@@ -378,7 +378,7 @@ Status PrepareForReduce(const Tensor* X,
   }
 
   const auto input_dims = input_shape.GetDims();
-  InlinedShapeVectorT<bool> reduced(rank, false);
+  InlinedShapeVector<bool> reduced(rank, false);
   prepare_reduce_metadata.output_dims.reserve(input_dims.size());
   if (axes.size() > 0) {
     prepare_reduce_metadata.output_dims = input_shape.AsShapeVector();
diff --git a/onnxruntime/test/framework/data_types_test.cc b/onnxruntime/test/framework/data_types_test.cc
index 9d6a564404..2fed5725b9 100644
--- a/onnxruntime/test/framework/data_types_test.cc
+++ b/onnxruntime/test/framework/data_types_test.cc
@@ -6,6 +6,7 @@
 
 #include "core/framework/data_types.h"
 #include "core/framework/data_types_internal.h"
+#include "core/framework/inlined_containers.h"
 #include "core/graph/onnx_protobuf.h"
 #include "gtest/gtest.h"
 
@@ -665,5 +666,27 @@ TEST_F(DataTypeTest, DataUtilsTest) {
   }
 }
 
+template<typename T>
+using Calc = CalculateInlinedVectorDefaultInlinedElements<T>;
+
+template <typename... Types>
+struct TypeMinimunInlinedElements {
+  std::array<std::pair<size_t, size_t>, sizeof...(Types)> sizes_{std::make_pair(sizeof(Types), Calc<Types>::value)...};
+  void print(std::ostream& os) const {
+    os << " CalculateInlinedVectorDefaultInlinedElements Sizes: ";
+    for (auto& p : sizes_) {
+      os << p.first << " -> " << p.second << std::endl;
+    }
+    os << std::endl;
+  }
+};
+
+TEST(InlinedVectorTests, TestDefaultInlinedCapacity) {
+
+  // Print "sizeof(T) -> default inlined element count" for each listed type; nothing is asserted.
+  TypeMinimunInlinedElements<int8_t, int16_t, int32_t, int64_t, std::string> sizes;
+  sizes.print(std::cout);
+
+}
 }  // namespace test
 }  // namespace onnxruntime
diff --git a/onnxruntime/test/providers/cpu/controlflow/scan_test.cc b/onnxruntime/test/providers/cpu/controlflow/scan_test.cc
index d2a8196770..eb7a9db752 100644
--- a/onnxruntime/test/providers/cpu/controlflow/scan_test.cc
+++ b/onnxruntime/test/providers/cpu/controlflow/scan_test.cc
@@ -390,7 +390,7 @@ static void RunTest_v9(const std::string test_name, int64_t sequence_len, int64_
 
       // skip if this is an invalid input test and axis is out of the valid range
       if (axis >= -rank && axis < rank) {
-        InlinedShapeVectorT<size_t> permutations;
+        InlinedShapeVector<size_t> permutations;
         TensorShapeVector new_shape;
         scan::detail::CalculateTransposedShapeForOutput(output_shape, HandleNegativeAxis(axis, rank),
                                                         permutations, new_shape);