diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc
index 6071bfd022..1619402cab 100644
--- a/onnxruntime/core/framework/tensorprotoutils.cc
+++ b/onnxruntime/core/framework/tensorprotoutils.cc
@@ -842,37 +842,104 @@ common::Status ConstantNodeProtoToTensorProto(const ONNX_NAMESPACE::NodeProto& n
 }
 
 #if !defined(DISABLE_SPARSE_TENSORS)
-template <typename T>
+// Decodes the sparse indices tensor (int8/int16/int32/int64; flattened 1-D or {NNZ, rank} 2-D) and calls
+// copier(from_idx, to_idx) for each sparse element, where to_idx is the flattened offset in the dense tensor
+// described by dims. Flattened indices that fall outside the dense tensor are rejected with an error status.
 static Status CopySparseData(size_t n_sparse_elements,
                              const ONNX_NAMESPACE::TensorProto& indices,
+                             const Path& model_path,
                              gsl::span<const int64_t> dims,
                              std::function<void(size_t from_idx, size_t to_idx)> copier) {
   Status status = Status::OK();
   TensorShape indices_shape(indices.dims().data(), indices.dims().size());
+  const auto elements = gsl::narrow<size_t>(indices_shape.Size());
 
-  ORT_RETURN_IF_NOT(indices.data_type() == ONNX_NAMESPACE ::TensorProto_DataType_INT64, "Indicies expected to be INT64");
-
+  std::vector<int64_t> indices_values;  // used for conversion of smaller size indices
+  std::vector<uint8_t> unpack_buffer;
   gsl::span<const int64_t> indices_data;
-  const auto elements = static_cast<size_t>(indices_shape.Size());
-  if (indices.int64_data_size() > 0) {
-    indices_data = gsl::make_span<const int64_t>(indices.int64_data().data(), elements);
-  } else if (indices.has_raw_data()) {
-    ORT_RETURN_IF_NOT(indices.raw_data().size() == (elements * sizeof(int64_t)),
-                      "Sparse Indicies raw data size does not match expected.");
-    indices_data = gsl::make_span<const int64_t>(reinterpret_cast<const int64_t*>(indices.raw_data().data()), elements);
-  } else {
-    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_GRAPH, "Invalid SparseTensor indices. Should either have raw or int64 data");
+  const bool has_raw_data = indices.has_raw_data();
+  switch (indices.data_type()) {
+    case ONNX_NAMESPACE::TensorProto_DataType_INT64:
+      if (has_raw_data) {
+        ORT_RETURN_IF_NOT(indices.raw_data().size() == (elements * sizeof(int64_t)),
+                          "Sparse Indices raw data size does not match expected.");
+        ORT_RETURN_IF_ERROR(UnpackInitializerData(indices, model_path, unpack_buffer));
+        indices_data = gsl::make_span(unpack_buffer).as_span<const int64_t>();
+      } else {
+        ORT_RETURN_IF_NOT(indices.int64_data_size() == static_cast<int64_t>(elements), "Sparse indices int64 data size does not match expected");
+        indices_data = gsl::make_span(indices.int64_data().data(), elements);
+      }
+      break;
+    case ONNX_NAMESPACE::TensorProto_DataType_INT32: {
+      if (has_raw_data) {
+        ORT_RETURN_IF_NOT(indices.raw_data().size() == (elements * sizeof(int32_t)),
+                          "Sparse Indices raw data size does not match expected.");
+        ORT_RETURN_IF_ERROR(UnpackInitializerData(indices, model_path, unpack_buffer));
+        auto int32_span = gsl::make_span(unpack_buffer).as_span<const int32_t>();
+        indices_values.insert(indices_values.cend(), int32_span.cbegin(), int32_span.cend());
+        unpack_buffer.clear();
+        unpack_buffer.shrink_to_fit();
+      } else {
+        ORT_RETURN_IF_NOT(indices.int32_data_size() == static_cast<int64_t>(elements), "Sparse indices int32 data size does not match expected");
+        indices_values.insert(indices_values.cend(), indices.int32_data().cbegin(), indices.int32_data().cend());
+      }
+      indices_data = gsl::make_span(indices_values);
+      break;
+    }
+    case ONNX_NAMESPACE::TensorProto_DataType_INT16: {
+      if (has_raw_data) {
+        ORT_RETURN_IF_NOT(indices.raw_data().size() == (elements * sizeof(int16_t)),
+                          "Sparse Indices raw data size does not match expected.");
+        ORT_RETURN_IF_ERROR(UnpackInitializerData(indices, model_path, unpack_buffer));
+        auto int16_span = gsl::make_span(unpack_buffer).as_span<const int16_t>();
+        indices_values.insert(indices_values.cend(), int16_span.cbegin(), int16_span.cend());
+        indices_data = gsl::make_span(indices_values);
+        unpack_buffer.clear();
+        unpack_buffer.shrink_to_fit();
+      } else {
+        return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_GRAPH,
+                               "Invalid SparseTensor indices. INT16 indices must be in the raw data of indices tensor");
+      }
+      break;
+    }
+    case ONNX_NAMESPACE::TensorProto_DataType_INT8: {
+      if (has_raw_data) {
+        ORT_RETURN_IF_NOT(indices.raw_data().size() == elements,
+                          "Sparse Indices raw data size does not match expected.");
+        ORT_RETURN_IF_ERROR(UnpackInitializerData(indices, model_path, unpack_buffer));
+        auto int8_span = gsl::make_span(unpack_buffer).as_span<const int8_t>();
+        indices_values.insert(indices_values.cend(), int8_span.cbegin(), int8_span.cend());
+        indices_data = gsl::make_span(indices_values);
+        unpack_buffer.clear();
+        unpack_buffer.shrink_to_fit();
+      } else {
+        return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_GRAPH,
+                               "Invalid SparseTensor indices. INT8 indices must be in the raw data of indices tensor");
+      }
+      break;
+    }
+    default:
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_GRAPH,
+                             "Invalid SparseTensor indices. Should be one of the following types: int8, int16, int32 or int64");
   }
 
   if (indices_shape.NumDimensions() == 1) {
     // flattened indexes
+    // Flattened indices are written through unchecked, so validate them against the dense element count.
+    SafeInt<int64_t> n_dense_elements = 1;
+    for (const auto dim : dims) {
+      n_dense_elements *= dim;
+    }
+    const int64_t dense_size = n_dense_elements;
     for (size_t i = 0; i < n_sparse_elements; ++i) {
-      copier(i, static_cast<size_t>(indices_data[i]));
+      ORT_RETURN_IF_NOT(indices_data[i] >= 0 && indices_data[i] < dense_size,
+                        "Sparse flattened index is out of bounds of the dense tensor");
+      copier(i, gsl::narrow<size_t>(indices_data[i]));
     }
   } else if (indices_shape.NumDimensions() == 2) {
     // entries in format {NNZ, rank}
-    size_t rank = static_cast<size_t>(indices_shape[1]);
-    ORT_ENFORCE(rank == dims.size() && rank > 0);
+    ORT_ENFORCE(indices_shape[1] > 0 && static_cast<size_t>(indices_shape[1]) == dims.size());
+    auto rank = static_cast<size_t>(indices_shape[1]);
     const int64_t* cur_index = indices_data.data();
     std::vector<size_t> multipliers;
     multipliers.resize(rank);
@@ -880,20 +936,23 @@ static Status CopySparseData(size_t n_sparse_elements,
     // calculate sum of inner dimension elements for each dimension.
     // e.g. if shape {2,3,4}, the result should be {3*4, 4, 1}
     multipliers[rank - 1] = 1;
-    for (int32_t r = static_cast<int32_t>(rank) - 2; r >= 0; --r) {
-      multipliers[r] = static_cast<size_t>(dims[r + 1]) * multipliers[r + 1];
+    for (auto r = rank - 1; r > 0; --r) {
+      multipliers[r - 1] = SafeInt<size_t>(dims[r]) * multipliers[r];
     }
 
     // calculate the offset for the entry
     // e.g. if shape was {2,3,4} and entry was (1, 0, 2) the offset is 14
     // as there are 2 rows, each with 12 entries per row
     for (size_t i = 0; i < n_sparse_elements; ++i) {
-      size_t idx = 0;
+      SafeInt<int64_t> idx = 0;
       for (size_t j = 0; j < rank; ++j) {
-        idx += static_cast<size_t>(cur_index[j]) * multipliers[j];
+        // each coordinate must lie inside its dimension, otherwise the write lands outside the dense buffer
+        ORT_RETURN_IF_NOT(cur_index[j] >= 0 && cur_index[j] < dims[j],
+                          "Sparse index is out of bounds of the dense tensor dimension");
+        idx += SafeInt<int64_t>(cur_index[j]) * multipliers[j];
       }
 
-      copier(i, idx);
+      copier(i, gsl::narrow<size_t>(static_cast<int64_t>(idx)));
       cur_index += rank;
     }
 
@@ -905,29 +961,7 @@ static Status CopySparseData(size_t n_sparse_elements,
 
   return status;
 }
-#endif  // !defined(DISABLE_SPARSE_TENSORS)
 
-namespace conversion_internal {
-#if !defined(DISABLE_SPARSE_TENSORS)
-struct UnsupportedSparseDataType {
-  void operator()(int32_t dt_type, Status& status) const {
-    status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unsupported sparse tensor data type of ", dt_type);
-  }
-};
-#endif
-template <typename T>
-struct GetElementSize {
-  Status operator()(size_t& element_size) const {
-    element_size = sizeof(T);
-    return Status::OK();
-  }
-};
-
-using SupportedConversionTypeList = onnxruntime::TypeList<float, double, MLFloat16, BFloat16,
-                                                          int8_t, uint8_t, int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t>;
-}  // namespace conversion_internal
-
-#if !defined(DISABLE_SPARSE_TENSORS)
 common::Status SparseTensorProtoToDenseTensorProto(const ONNX_NAMESPACE::SparseTensorProto& sparse,
                                                    const Path& model_path,
                                                    ONNX_NAMESPACE::TensorProto& dense) {
@@ -953,74 +987,70 @@ common::Status SparseTensorProtoToDenseTensorProto(const ONNX_NAMESPACE::SparseT
   auto dims = gsl::make_span<const int64_t>(dense.dims().data(), dense.dims().size());
 
   if (type != TensorProto_DataType_STRING) {
+    auto ml_data = DataTypeImpl::TensorTypeFromONNXEnum(type)->GetElementType();
+    size_t element_size = ml_data->Size();
+
     // need to read in sparse data first as it could be in a type specific field, in raw data, or in external data
     std::vector<uint8_t> sparse_data_storage;
     ORT_RETURN_IF_ERROR(UnpackInitializerData(sparse_values, model_path, sparse_data_storage));
     void* sparse_data = sparse_data_storage.data();
-    size_t element_size = 0;
-    // We want to this list to match the one used below in DenseTensorToSparseTensorProto()
-    MLTypeCallDispatcherFromTypeList<conversion_internal::SupportedConversionTypeList> type_disp(type);
-    ORT_RETURN_IF_ERROR(
-        (type_disp.InvokeRetWithUnsupportedPolicy<Status, conversion_internal::GetElementSize, conversion_internal::UnsupportedSparseDataType>(element_size)));
 
     // by putting the data into a std::string we can avoid a copy as set_raw_data can do a std::move
-    // into the TensorProto. however to actually write to the buffer we have created in the std::string we need
-    // this somewhat dirty hack to get a mutable pointer. we could alternatively use &dense_data_storage.front()
-    // but using const_cast makes it more obvious we're doing something ugly.
-    // C++17 add non-const data() where we could remove const_cast
+    // into the TensorProto.
     std::string dense_data_storage(n_dense_elements * element_size, 0);
     if (n_sparse_elements > 0) {
-      void* dense_data = const_cast<char*>(dense_data_storage.data());
+      void* dense_data = dense_data_storage.data();
 
       switch (element_size) {
         case 1: {
-          auto dense_data_span = gsl::make_span<uint8_t>(static_cast<uint8_t*>(dense_data), n_dense_elements);
-          status = CopySparseData<uint8_t>(
+          status = CopySparseData(
               n_sparse_elements,
-              indices, dims,
-              [sparse_data, dense_data_span](size_t from_idx, size_t to_idx) {
-                dense_data_span[to_idx] = static_cast<const uint8_t*>(sparse_data)[from_idx];
+              indices, model_path, dims,
+              [sparse_data, dense_data](size_t from_idx, size_t to_idx) {
+                static_cast<uint8_t*>(dense_data)[to_idx] = static_cast<const uint8_t*>(sparse_data)[from_idx];
               });
 
           break;
         }
         case 2: {
-          auto dense_data_span = gsl::make_span<uint16_t>(static_cast<uint16_t*>(dense_data), n_dense_elements);
-          status = CopySparseData<uint16_t>(
+          status = CopySparseData(
               n_sparse_elements,
-              indices, dims,
-              [sparse_data, dense_data_span](size_t from_idx, size_t to_idx) {
-                dense_data_span[to_idx] = static_cast<const uint16_t*>(sparse_data)[from_idx];
+              indices, model_path, dims,
+              [sparse_data, dense_data](size_t from_idx, size_t to_idx) {
+                const auto* src = static_cast<const uint16_t*>(sparse_data) + from_idx;
+                auto* dst = static_cast<uint16_t*>(dense_data) + to_idx;
+                memcpy(dst, src, sizeof(uint16_t));
               });
 
           break;
         }
         case 4: {
-          auto dense_data_span = gsl::make_span<uint32_t>(static_cast<uint32_t*>(dense_data), n_dense_elements);
-          status = CopySparseData<uint32_t>(
+          status = CopySparseData(
               n_sparse_elements,
-              indices, dims,
-              [sparse_data, dense_data_span](size_t from_idx, size_t to_idx) {
-                dense_data_span[to_idx] = static_cast<const uint32_t*>(sparse_data)[from_idx];
+              indices, model_path, dims,
+              [sparse_data, dense_data](size_t from_idx, size_t to_idx) {
+                const auto* src = static_cast<const uint32_t*>(sparse_data) + from_idx;
+                auto* dst = static_cast<uint32_t*>(dense_data) + to_idx;
+                memcpy(dst, src, sizeof(uint32_t));
               });
 
           break;
         }
         case 8: {
-          auto dense_data_span = gsl::make_span<uint64_t>(static_cast<uint64_t*>(dense_data), n_dense_elements);
-          status = CopySparseData<uint64_t>(
+          status = CopySparseData(
               n_sparse_elements,
-              indices, dims,
-              [sparse_data, dense_data_span](size_t from_idx, size_t to_idx) {
-                dense_data_span[to_idx] = static_cast<const uint64_t*>(sparse_data)[from_idx];
+              indices, model_path, dims,
+              [sparse_data, dense_data](size_t from_idx, size_t to_idx) {
+                const auto* src = static_cast<const uint64_t*>(sparse_data) + from_idx;
+                auto* dst = static_cast<uint64_t*>(dense_data) + to_idx;
+                memcpy(dst, src, sizeof(uint64_t));
               });
           break;
         }
 
         default:
           return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
-                                 " BUG! Report to onnxruntime team. element_size of: ",
-                                 element_size, " is not supported.", " type: ", type);
+                                 "Element_size of: ", element_size, " is not supported.", " type: ", type);
       }
 
       ORT_RETURN_IF_ERROR(status);
@@ -1029,7 +1059,8 @@ common::Status SparseTensorProtoToDenseTensorProto(const ONNX_NAMESPACE::SparseT
 
   } else {
     // No request for std::string
-    conversion_internal::UnsupportedSparseDataType()(ONNX_NAMESPACE::TensorProto_DataType_STRING, status);
+    status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unsupported sparse tensor data type of ",
+                             ONNX_NAMESPACE::TensorProto_DataType_STRING);
   }
   return status;
 }
@@ -1040,36 +1071,6 @@ using IsZeroFunc = bool (*)(const void*);
 // Copy element
 using CopyElementFunc = void (*)(void* dest, const void* src, int64_t dest_index, int64_t src_index);
 
-static void SparsifyGeneric(const void* dense_raw_data, size_t n_dense_elements, size_t element_size,
-                            IsZeroFunc is_zero, CopyElementFunc copy,
-                            TensorProto& values, TensorProto& indices) {
-  auto advance = [element_size](const void* start, size_t elements) -> const void* {
-    return (reinterpret_cast<const uint8_t*>(start) + elements * element_size);
-  };
-
-  const auto* cbegin = dense_raw_data;
-  const auto* const cend = advance(cbegin, n_dense_elements);
-  auto& indices_data = *indices.mutable_int64_data();
-  int64_t index = 0;
-  while (cbegin != cend) {
-    if (!is_zero(cbegin)) {
-      indices_data.Add(index);
-    }
-    ++index;
-    cbegin = advance(cbegin, 1U);
-  }
-
-  auto& raw_data = *values.mutable_raw_data();
-  raw_data.resize(indices.int64_data_size() * element_size);
-  void* data_dest = const_cast<char*>(raw_data.data());
-
-  int64_t dest_index = 0;
-  for (auto src_index : indices.int64_data()) {
-    copy(data_dest, dense_raw_data, dest_index, src_index);
-    ++dest_index;
-  }
-}
-
 // Here we are not using tolerance for FP types since these dense tensors were
 // created from sparse initializers where zeros were absolute
 template <typename T>
@@ -1079,7 +1080,85 @@ inline bool IsZero(const void* p) {
 
 template <typename T>
 inline void CopyElement(void* dst, const void* src, int64_t dst_index, int64_t src_index) {
-  reinterpret_cast<T*>(dst)[dst_index] = reinterpret_cast<const T*>(src)[src_index];
+  const auto* src_p = reinterpret_cast<const T*>(src) + src_index;
+  auto* dst_p = reinterpret_cast<T*>(dst) + dst_index;
+  memcpy(dst_p, src_p, sizeof(T));
+}
+
+template <>
+inline void CopyElement<uint8_t>(void* dst, const void* src, int64_t dst_index, int64_t src_index) {
+  reinterpret_cast<uint8_t*>(dst)[dst_index] = reinterpret_cast<const uint8_t*>(src)[src_index];
+}
+
+// Narrows each gathered index to T (no range check here), stores them as the bytes of raw_indices; tags indices as T.
+template <typename T>
+static void SetIndices(gsl::span<const int64_t> gathered_indices,
+                       std::string& raw_indices,
+                       TensorProto& indices) {
+  raw_indices.resize(gathered_indices.size() * sizeof(T));
+  auto* ind_dest = reinterpret_cast<T*>(raw_indices.data());
+  size_t dest_index = 0;
+  for (auto src_index : gathered_indices) {
+    ORT_IF_CONSTEXPR(sizeof(T) == sizeof(int8_t)) {
+      ind_dest[dest_index] = static_cast<T>(src_index);
+    } else {
+      auto* dst = ind_dest + dest_index;
+      T v = static_cast<T>(src_index);
+      memcpy(dst, &v, sizeof(T));
+    }
+    ++dest_index;
+  }
+  indices.set_data_type(utils::ToTensorProtoElementType<T>());
+}
+
+static void SparsifyGeneric(const void* dense_raw_data, size_t n_dense_elements, size_t element_size,
+                            IsZeroFunc is_zero, CopyElementFunc copy,
+                            TensorProto& values, TensorProto& indices,
+                            size_t& nnz) {
+  auto advance = [element_size](const void* start, size_t elements) -> const void* {
+    return (reinterpret_cast<const uint8_t*>(start) + elements * element_size);
+  };
+
+  const auto* cbegin = dense_raw_data;
+  const auto* const cend = advance(cbegin, n_dense_elements);
+  std::vector<int64_t> gathered_indices;
+  int64_t index = 0;
+  while (cbegin != cend) {
+    if (!is_zero(cbegin)) {
+      gathered_indices.push_back(index);
+    }
+    ++index;
+    cbegin = advance(cbegin, 1U);
+  }
+
+  if (!gathered_indices.empty()) {
+    auto& raw_data = *values.mutable_raw_data();
+    raw_data.resize(gathered_indices.size() * element_size);
+    void* data_dest = raw_data.data();
+
+    int64_t dest_index = 0;
+    for (auto src_index : gathered_indices) {
+      copy(data_dest, dense_raw_data, dest_index, src_index);
+      ++dest_index;
+    }
+
+    auto gathered_span = gsl::make_span(gathered_indices);
+    auto& raw_indices = *indices.mutable_raw_data();
+    const auto max_index = gathered_indices.back();
+    if (max_index <= std::numeric_limits<int8_t>::max()) {
+      SetIndices<int8_t>(gathered_span, raw_indices, indices);
+    } else if (max_index <= std::numeric_limits<int16_t>::max()) {
+      SetIndices<int16_t>(gathered_span, raw_indices, indices);
+    } else if (max_index <= std::numeric_limits<int32_t>::max()) {
+      SetIndices<int32_t>(gathered_span, raw_indices, indices);
+    } else {
+      SetIndices<int64_t>(gathered_span, raw_indices, indices);
+    }
+  } else {
+    indices.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT8);
+    indices.set_raw_data(std::string());
+  }
+  nnz = gathered_indices.size();
 }
 
 common::Status DenseTensorToSparseTensorProto(const ONNX_NAMESPACE::TensorProto& dense_proto,
@@ -1087,11 +1166,9 @@ common::Status DenseTensorToSparseTensorProto(const ONNX_NAMESPACE::TensorProto&
                                               ONNX_NAMESPACE::SparseTensorProto& result) {
   ORT_ENFORCE(HasDataType(dense_proto), "Must have a valid data type");
 
-  const bool is_string_data = dense_proto.data_type() == ONNX_NAMESPACE::TensorProto_DataType_STRING;
-  if (is_string_data) {
-    Status status{};
-    conversion_internal::UnsupportedSparseDataType()(ONNX_NAMESPACE::TensorProto_DataType_STRING, status);
-    return status;
+  if (dense_proto.data_type() == ONNX_NAMESPACE::TensorProto_DataType_STRING) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unsupported sparse tensor data type of ",
+                           ONNX_NAMESPACE::TensorProto_DataType_STRING);
   }
 
   const auto data_type = dense_proto.data_type();
@@ -1101,51 +1178,47 @@ common::Status DenseTensorToSparseTensorProto(const ONNX_NAMESPACE::TensorProto&
   values.set_data_type(data_type);
 
   auto& indices = *sparse_proto.mutable_indices();
-  indices.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
 
   SafeInt<size_t> n_dense_elements = 1;
   for (auto dim : dense_proto.dims()) {
     n_dense_elements *= dim;
   }
 
+  auto ml_data = DataTypeImpl::TensorTypeFromONNXEnum(data_type)->GetElementType();
+  size_t element_size = ml_data->Size();
+
   std::vector<uint8_t> dense_raw_data;
   ORT_RETURN_IF_ERROR(UnpackInitializerData(dense_proto, model_path, dense_raw_data));
-  size_t element_size = 0;
-  // We want this type list to match the one above in SparseTensorProtoToDenseTensorProto
-  MLTypeCallDispatcherFromTypeList<conversion_internal::SupportedConversionTypeList> type_disp(data_type);
-  ORT_RETURN_IF_ERROR(
-      (type_disp.InvokeRetWithUnsupportedPolicy<Status, conversion_internal::GetElementSize, conversion_internal::UnsupportedSparseDataType>(element_size)));
 
+  size_t nnz = 0;
   void* dense_data = dense_raw_data.data();
   switch (element_size) {
     case 1: {
       SparsifyGeneric(dense_data, n_dense_elements, element_size,
-                      IsZero<uint8_t>, CopyElement<uint8_t>, values, indices);
+                      IsZero<uint8_t>, CopyElement<uint8_t>, values, indices, nnz);
       break;
     }
     case 2: {
       SparsifyGeneric(dense_data, n_dense_elements, element_size,
-                      IsZero<uint16_t>, CopyElement<uint16_t>, values, indices);
+                      IsZero<uint16_t>, CopyElement<uint16_t>, values, indices, nnz);
       break;
     }
     case 4: {
       SparsifyGeneric(dense_data, n_dense_elements, element_size,
-                      IsZero<uint32_t>, CopyElement<uint32_t>, values, indices);
+                      IsZero<uint32_t>, CopyElement<uint32_t>, values, indices, nnz);
       break;
     }
     case 8: {
       SparsifyGeneric(dense_data, n_dense_elements, element_size,
-                      IsZero<uint64_t>, CopyElement<uint64_t>, values, indices);
+                      IsZero<uint64_t>, CopyElement<uint64_t>, values, indices, nnz);
       break;
     }
     default:
       return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
-                             " BUG! Report to onnxruntime team. element_size of: ",
-                             element_size, " is not supported.", " data_type: ", data_type);
+                             "Element_size of: ", element_size, " is not supported.", " data_type: ", data_type);
   }
 
   // Fix up shapes
-  const auto nnz = indices.int64_data_size();
   values.add_dims(nnz);
   indices.add_dims(nnz);
 
diff --git a/onnxruntime/test/framework/sparse_kernels_test.cc b/onnxruntime/test/framework/sparse_kernels_test.cc
index 83aa35101f..4e4afd979f 100644
--- a/onnxruntime/test/framework/sparse_kernels_test.cc
+++ b/onnxruntime/test/framework/sparse_kernels_test.cc
@@ -669,8 +669,51 @@ static void CreateTensorWithExternalData(
   tensor_proto.set_data_type(type);
 }
 
+namespace {
+
+void insert_indices_data(bool indices_1D,
+                         size_t values_size, size_t shape_size,
+                         std::vector<int8_t>& indices_data,
+                         TensorProto& indices_tp) {
+  if (indices_1D) {
+    indices_data = {2, 5, 6, 10};
+    indices_tp.add_dims(indices_data.size());
+  } else {
+    // indices are shape {NNZ, rank} so convert flattened values of 2, 5, 6 and 10 to rank 3 values
+    indices_tp.add_dims(values_size);
+    indices_tp.add_dims(shape_size);
+    indices_data = {
+        0, 1, 0,
+        0, 2, 1,
+        1, 0, 0,
+        1, 2, 0};
+  }
+}
+
 template <typename T>
-static NodeProto CreateConstantNode(bool indices_1D,
+struct InsertIndices {
+  void operator()(bool indices_1D, size_t values_size, size_t shape_size, TensorProto& indices_tp) const {
+    static_assert(std::is_integral_v<T>, "indices data must be integral data type");
+    static_assert(std::is_signed_v<T>, "indices must be signed data type");
+    std::vector<int8_t> indices_data;
+    insert_indices_data(indices_1D, values_size, shape_size, indices_data, indices_tp);
+    indices_tp.set_data_type(utils::ToTensorProtoElementType<T>());
+    ORT_IF_CONSTEXPR (sizeof(T) == sizeof(int8_t)) {
+      indices_tp.mutable_raw_data()->assign(reinterpret_cast<const char*>(indices_data.data()), indices_data.size());
+    } else {
+      // Conversion on the fly to the target data type
+      std::vector<T> indices(indices_data.cbegin(), indices_data.cend());
+      indices_tp.mutable_raw_data()->assign(reinterpret_cast<const char*>(indices.data()), indices.size() * sizeof(T));
+    }
+  }
+};
+
+using SupportedIndicesTypeList = onnxruntime::TypeList<int8_t, int16_t, int32_t, int64_t>;
+
+}  // namespace
+
+template <typename T>
+static NodeProto CreateConstantNode(bool indices_1D, int32_t indices_type,
                                     std::function<void(const std::vector<T>& values, TensorProto& tp)> inserter,
                                     std::vector<T>& expected_data) {
   NodeProto constant_node;
@@ -678,7 +721,6 @@ static NodeProto CreateConstantNode(bool indices_1D,
   constant_node.add_output("dense_tensor_output");
 
   std::vector<T> values = CreateValues<T>();
-  std::vector<int64_t> indices;
   std::vector<int64_t> shape{2, 3, 2};
 
   AttributeProto& attrib = *constant_node.mutable_attribute()->Add();
@@ -686,26 +728,11 @@ static NodeProto CreateConstantNode(bool indices_1D,
   attrib.set_type(AttributeProto_AttributeType_SPARSE_TENSOR);
 
   SparseTensorProto& stp = *attrib.mutable_sparse_tensor();
-  TensorProto& indices_tp = *stp.mutable_indices();
-
   stp.mutable_dims()->Add(shape.cbegin(), shape.cend());
 
-  if (indices_1D) {
-    indices = {2, 5, 6, 10};
-    indices_tp.add_dims(indices.size());
-  } else {
-    // indices are shape {NNZ, rank} so convert flattened values of 2, 5, 6 and 10 to rank 3 values
-    indices_tp.add_dims(values.size());
-    indices_tp.add_dims(shape.size());
-    indices = {
-        0, 1, 0,
-        0, 2, 1,
-        1, 0, 0,
-        1, 2, 0};
-  }
-
-  indices_tp.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
-  indices_tp.mutable_int64_data()->Add(indices.cbegin(), indices.cend());
+  TensorProto& indices_tp = *stp.mutable_indices();
+  utils::MLTypeCallDispatcherFromTypeList<SupportedIndicesTypeList> type_disp(indices_type);
+  type_disp.Invoke<InsertIndices>(indices_1D, values.size(), shape.size(), indices_tp);
 
   expected_data.resize(2 * 3 * 2);
   expected_data[2] = values[0];
@@ -733,10 +760,9 @@ static NodeProto CreateConstantNodeAllZeros(bool indices_1D, std::vector<T>& exp
   attrib.set_type(AttributeProto_AttributeType_SPARSE_TENSOR);
 
   SparseTensorProto& stp = *attrib.mutable_sparse_tensor();
-  TensorProto& indices_tp = *stp.mutable_indices();
-
   stp.mutable_dims()->Add(shape.cbegin(), shape.cend());
 
+  TensorProto& indices_tp = *stp.mutable_indices();
   if (indices_1D) {
     indices_tp.add_dims(0);
   } else {
@@ -759,11 +785,11 @@ static NodeProto CreateConstantNodeAllZeros(bool indices_1D, std::vector<T>& exp
 }
 
 template <typename T>
-static void TestConversion(bool use_1D_indices,
+static void TestConversion(bool use_1D_indices, int32_t indices_type,
                            std::function<void(const std::vector<T>& values, TensorProto& tp)> inserter,
                            std::function<void(gsl::span<const T> expected, const TensorProto& actual)> checker) {
   std::vector<T> expected;
-  auto node = CreateConstantNode<T>(use_1D_indices, inserter, expected);
+  auto node = CreateConstantNode<T>(use_1D_indices, indices_type, inserter, expected);
 
   TensorProto dense;
   // Path is required for loading external data (if any)
@@ -793,8 +819,17 @@ template <typename T>
 static void TestConversion(
     std::function<void(const std::vector<T>& values, TensorProto& tp)> inserter,
     std::function<void(gsl::span<const T> expected, const TensorProto& actual)> checker) {
-  TestConversion(true, inserter, checker);
-  TestConversion(false, inserter, checker);
+  std::vector<TensorProto_DataType> indices_types{
+      TensorProto_DataType_INT8,
+      TensorProto_DataType_INT16,
+      TensorProto_DataType_INT32,
+      TensorProto_DataType_INT64
+  };
+
+  for (auto dt : indices_types) {
+    TestConversion(true, dt, inserter, checker);
+    TestConversion(false, dt, inserter, checker);
+  }
   TestConversionAllZeros(true, checker);
   TestConversionAllZeros(false, checker);
 }
@@ -820,7 +855,7 @@ static void RawDataChecker(gsl::span<const T> expected, const TensorProto& actua
   const T* raw_data = reinterpret_cast<const T*>(actual.raw_data().data());
   auto actual_span = gsl::make_span<const T>(raw_data, actual_size);
 
-  EXPECT_THAT(actual_span, testing::ContainerEq(expected));
+  ASSERT_THAT(actual_span, testing::ContainerEq(expected));
 }
 
 template <>
@@ -831,7 +866,7 @@ void RawDataChecker<MLFloat16>(gsl::span<const MLFloat16> expected_bfloat, const
   const uint16_t* raw_data = reinterpret_cast<const uint16_t*>(actual.raw_data().data());
   auto actual_span = gsl::make_span<const uint16_t>(raw_data, actual_size);
 
-  EXPECT_THAT(actual_span, testing::ContainerEq(expected));
+  ASSERT_THAT(actual_span, testing::ContainerEq(expected));
 }
 
 template <>
@@ -842,7 +877,7 @@ void RawDataChecker<BFloat16>(gsl::span<const BFloat16> expected_bfloat, const T
   const uint16_t* raw_data = reinterpret_cast<const uint16_t*>(actual.raw_data().data());
   auto actual_span = gsl::make_span<const uint16_t>(raw_data, actual_size);
 
-  EXPECT_THAT(actual_span, testing::ContainerEq(expected));
+  ASSERT_THAT(actual_span, testing::ContainerEq(expected));
 }
 
 TEST(SparseTensorConversionTests, TestConstantNodeConversion) {
@@ -938,6 +973,7 @@ TEST(SparseTensorConversionTests, TestConstantNodeConversion) {
   PathString tensor_filename(ORT_TSTR("tensor_XXXXXX"));
   TestConversion<float>(
       true,
+      TensorProto_DataType_INT64,
       [&tensor_filename](const std::vector<float>& values, TensorProto& tp) {
         CreateTensorWithExternalData<float>(TensorProto_DataType_FLOAT, values, tensor_filename, tp);
       },
@@ -950,8 +986,11 @@ TEST(SparseTensorConversionTests, TestConstantNodeConversion) {
 #if !defined(ORT_MINIMAL_BUILD)
 
 template <typename T>
-static std::vector<T> CreateSparseValues() {
-  return {0, 2, 3, 0};
+static std::vector<T> CreateSparseValues(size_t indices_start) {
+  std::vector<T> result(indices_start + 2);
+  result[indices_start] = 2;
+  result[indices_start + 1] = 3;
+  return result;
 }
 
 /* std::string support in the future
@@ -962,13 +1001,19 @@ std::vector<std::string> CreateSparseValues<std::string>() {
 */
 
 template <>
-std::vector<BFloat16> CreateSparseValues<BFloat16>() {
-  return {BFloat16(0.f), BFloat16(2.f), BFloat16(3.f), BFloat16(0.f)};
+std::vector<BFloat16> CreateSparseValues<BFloat16>(size_t indices_start) {
+  std::vector<BFloat16> result(indices_start + 2);
+  result[indices_start] = BFloat16(2.f);
+  result[indices_start + 1] = BFloat16(3.f);
+  return result;
 }
 
 template <>
-std::vector<MLFloat16> CreateSparseValues<MLFloat16>() {
-  return {MLFloat16(0.f), MLFloat16(2.f), MLFloat16(3.f), MLFloat16(0.f)};
+std::vector<MLFloat16> CreateSparseValues<MLFloat16>(size_t indices_start) {
+  std::vector<MLFloat16> result(indices_start + 2);
+  result[indices_start] = MLFloat16(2.f);
+  result[indices_start + 1] = MLFloat16(3.f);
+  return result;
 }
 
 template <typename T>
@@ -987,11 +1032,13 @@ std::vector<MLFloat16> CreateSparseValuesAllZeros<MLFloat16>() {
 }
 
 template <typename T>
-TensorProto CreateDenseTensor(std::function<void(const std::vector<T>& values, TensorProto& tp)> inserter,
+TensorProto CreateDenseTensor(size_t indices_start,
+                              std::function<void(const std::vector<T>& values, TensorProto& tp)> inserter,
                               std::vector<T>& expected_values, std::vector<int64_t>& expected_indicies) {
   TensorProto result;
-  std::vector<T> values = CreateSparseValues<T>();
-  expected_indicies = {1, 2};
+  std::vector<T> values = CreateSparseValues<T>(indices_start);
+  auto ind_start = static_cast<int64_t>(indices_start); 
+  expected_indicies = {ind_start, ind_start + 1};
   for (const auto& ind : expected_indicies) {
     expected_values.push_back(values[ind]);
   }
@@ -1026,12 +1073,9 @@ static void RawSparseDataChecker(gsl::span<const T> expected_values,
   const T* raw_data = reinterpret_cast<const T*>(actual.values().raw_data().data());
   auto actual_span = gsl::make_span<const T>(raw_data, actual_size);
 
-  EXPECT_THAT(actual_span, testing::ContainerEq(expected_values));
+  ASSERT_THAT(actual_span, testing::ContainerEq(expected_values));
 
-  // Check indicies
-  EXPECT_THAT(actual.indices().data_type(), ONNX_NAMESPACE::TensorProto_DataType_INT64);
-  auto actual_indicies = gsl::make_span<const int64_t>(actual.indices().int64_data().data(), actual.indices().int64_data_size());
-  EXPECT_THAT(actual_indicies, testing::ContainerEq(expected_indicies));
+  SparseIndicesChecker(actual.indices(), expected_indicies);
 }
 
 template <>
@@ -1045,11 +1089,8 @@ void RawSparseDataChecker<BFloat16>(gsl::span<const BFloat16> expected_bfloat,
   const uint16_t* raw_data = reinterpret_cast<const uint16_t*>(actual.values().raw_data().data());
   auto actual_span = gsl::make_span<const uint16_t>(raw_data, actual_size);
 
-  EXPECT_THAT(actual_span, testing::ContainerEq(expected));
-  // Check indicies
-  EXPECT_THAT(actual.indices().data_type(), ONNX_NAMESPACE::TensorProto_DataType_INT64);
-  auto actual_indicies = gsl::make_span<const int64_t>(actual.indices().int64_data().data(), actual.indices().int64_data_size());
-  EXPECT_THAT(actual_indicies, testing::ContainerEq(expected_indicies));
+  ASSERT_THAT(actual_span, testing::ContainerEq(expected));
+  SparseIndicesChecker(actual.indices(), expected_indicies);
 }
 
 template <>
@@ -1063,15 +1104,12 @@ void RawSparseDataChecker<MLFloat16>(gsl::span<const MLFloat16> expected_bfloat,
   const uint16_t* raw_data = reinterpret_cast<const uint16_t*>(actual.values().raw_data().data());
   auto actual_span = gsl::make_span<const uint16_t>(raw_data, actual_size);
 
-  EXPECT_THAT(actual_span, testing::ContainerEq(expected));
-  // Check indicies
-  EXPECT_THAT(actual.indices().data_type(), ONNX_NAMESPACE::TensorProto_DataType_INT64);
-  auto actual_indicies = gsl::make_span<const int64_t>(actual.indices().int64_data().data(), actual.indices().int64_data_size());
-  EXPECT_THAT(actual_indicies, testing::ContainerEq(expected_indicies));
+  ASSERT_THAT(actual_span, testing::ContainerEq(expected));
+  SparseIndicesChecker(actual.indices(), expected_indicies);
 }
 
 template <typename T>
-static void TestDenseToSparseConversionValues(
+static void TestDenseToSparseConversionValues(size_t indices_start,
     std::function<void(const std::vector<T>& values, TensorProto& tp)> inserter,
     std::function<void(gsl::span<const T> expected,
                        gsl::span<const int64_t> expected_indicies,
@@ -1082,7 +1120,7 @@ static void TestDenseToSparseConversionValues(
   // Path is required for loading external data
   // Using empty path here since the data is not external
   Path model_path;
-  TensorProto dense_tensor = CreateDenseTensor(inserter, expected_values, expected_indicies);
+  TensorProto dense_tensor = CreateDenseTensor(indices_start, inserter, expected_values, expected_indicies);
 
   SparseTensorProto sparse_tensor;
   utils::DenseTensorToSparseTensorProto(dense_tensor, model_path, sparse_tensor);
@@ -1117,17 +1155,21 @@ static void TestDenseAllZerosToSparseConversion(
 }
 
 template <typename T>
-static void TestDenseToSparseConversion(std::function<void(const std::vector<T>& values, TensorProto& tp)> inserter,
+static void TestDenseToSparseConversion(size_t indices_start,
+                                        std::function<void(const std::vector<T>& values, TensorProto& tp)> inserter,
                                         std::function<void(gsl::span<const T> expected,
                                                            gsl::span<const int64_t> expected_indicies,
                                                            const SparseTensorProto& actual)>
                                             checker) {
-  TestDenseToSparseConversionValues<T>(inserter, checker);
+  TestDenseToSparseConversionValues<T>(indices_start, inserter, checker);
   TestDenseAllZerosToSparseConversion<T>(inserter, checker);
 }
 
 TEST(SparseTensorConversionTests, TestDenseToSparseConversion) {
+  // This one will test indices that are less than max int8 value
+  // which should result in int8 indices
   TestDenseToSparseConversion<float>(
+      20U,
       [](const std::vector<float>& values, TensorProto& tp) {
         tp.set_data_type(TensorProto_DataType_FLOAT);
         tp.set_name("dense_float");
@@ -1135,7 +1177,10 @@ TEST(SparseTensorConversionTests, TestDenseToSparseConversion) {
       },
       RawSparseDataChecker<float>);
 
+  // This one will test indices that are max(int8) < ind < max(int16) value
+  // which should result in int16 indices
   TestDenseToSparseConversion<double>(
+      static_cast<size_t>(std::numeric_limits<int8_t>::max()) + 20U,
       [](const std::vector<double>& values, TensorProto& tp) {
         tp.set_data_type(TensorProto_DataType_DOUBLE);
         tp.set_name("dense_double");
@@ -1143,7 +1188,10 @@ TEST(SparseTensorConversionTests, TestDenseToSparseConversion) {
       },
       RawSparseDataChecker<double>);
 
+  // This one will test indices that are max(int16) < ind < max(int32) value
+  // which should result in int32 indices
   TestDenseToSparseConversion<BFloat16>(
+      static_cast<size_t>(std::numeric_limits<int16_t>::max()) + 20U,
       [](const std::vector<BFloat16>& values, TensorProto& tp) {
         tp.set_data_type(TensorProto_DataType_BFLOAT16);
         tp.set_name("dense_bfloat16");
@@ -1153,7 +1201,11 @@ TEST(SparseTensorConversionTests, TestDenseToSparseConversion) {
       },
       RawSparseDataChecker<BFloat16>);
 
+  // Protobuf cannot hold more than 2GB of data, so indices that require 64 bits cannot be
+  // produced by a dense-to-sparse conversion here; they must be created explicitly to be tested.
+  // The remaining cases use small indices again, which should result in int8 indices.
   TestDenseToSparseConversion<MLFloat16>(
+      20U,
       [](const std::vector<MLFloat16>& values, TensorProto& tp) {
         tp.set_data_type(TensorProto_DataType_FLOAT16);
         tp.set_name("dense_float16");
@@ -1164,6 +1216,7 @@ TEST(SparseTensorConversionTests, TestDenseToSparseConversion) {
       RawSparseDataChecker<MLFloat16>);
 
   TestDenseToSparseConversion<int16_t>(
+      20U,
       [](const std::vector<int16_t>& values, TensorProto& tp) {
         tp.set_name("dense_int16");
         tp.set_data_type(TensorProto_DataType_INT16);
@@ -1172,6 +1225,7 @@ TEST(SparseTensorConversionTests, TestDenseToSparseConversion) {
       RawSparseDataChecker<int16_t>);
 
   TestDenseToSparseConversion<uint16_t>(
+      20U,
       [](const std::vector<uint16_t>& values, TensorProto& tp) {
         tp.set_name("dense_uint16");
         tp.set_data_type(TensorProto_DataType_UINT16);
@@ -1180,6 +1234,7 @@ TEST(SparseTensorConversionTests, TestDenseToSparseConversion) {
       RawSparseDataChecker<uint16_t>);
 
   TestDenseToSparseConversion<int32_t>(
+      20U,
       [](const std::vector<int32_t>& values, TensorProto& tp) {
         tp.set_name("dense_int32");
         tp.set_data_type(TensorProto_DataType_INT32);
@@ -1188,6 +1243,7 @@ TEST(SparseTensorConversionTests, TestDenseToSparseConversion) {
       RawSparseDataChecker<int32_t>);
 
   TestDenseToSparseConversion<uint32_t>(
+      20U,
       [](const std::vector<uint32_t>& values, TensorProto& tp) {
         tp.set_name("dense_uint32");
         tp.set_data_type(TensorProto_DataType_UINT32);
@@ -1196,6 +1252,7 @@ TEST(SparseTensorConversionTests, TestDenseToSparseConversion) {
       RawSparseDataChecker<uint32_t>);
 
   TestDenseToSparseConversion<int64_t>(
+      20U,
       [](const std::vector<int64_t>& values, TensorProto& tp) {
         tp.set_name("dense_int64");
         tp.set_data_type(TensorProto_DataType_INT64);
@@ -1204,6 +1261,7 @@ TEST(SparseTensorConversionTests, TestDenseToSparseConversion) {
       RawSparseDataChecker<int64_t>);
 
   TestDenseToSparseConversion<uint64_t>(
+      20U,
       [](const std::vector<uint64_t>& values, TensorProto& tp) {
         tp.set_name("dense_uint64");
         tp.set_data_type(TensorProto_DataType_UINT64);
@@ -1212,6 +1270,7 @@ TEST(SparseTensorConversionTests, TestDenseToSparseConversion) {
       RawSparseDataChecker<uint64_t>);
 
   TestDenseToSparseConversion<int8_t>(
+      20U,
       [](const std::vector<int8_t>& values, TensorProto& tp) {
         tp.set_name("dense_int8");
         tp.set_data_type(TensorProto_DataType_INT8);
@@ -1220,6 +1279,7 @@ TEST(SparseTensorConversionTests, TestDenseToSparseConversion) {
       RawSparseDataChecker<int8_t>);
 
   TestDenseToSparseConversion<uint8_t>(
+      20U,
       [](const std::vector<uint8_t>& values, TensorProto& tp) {
         tp.set_name("dense_int64");
         RawDataWriter(values, tp, TensorProto_DataType_UINT8);
diff --git a/onnxruntime/test/framework/test_utils.h b/onnxruntime/test/framework/test_utils.h
index 9d5633ea34..c206492961 100644
--- a/onnxruntime/test/framework/test_utils.h
+++ b/onnxruntime/test/framework/test_utils.h
@@ -97,5 +97,9 @@ void AllocateMLValue(AllocatorPtr alloc, const std::vector<int64_t>& dims, OrtVa
 // Helper function to check that the graph transformations have been successfully applied.
 std::map<std::string, int> CountOpsInGraph(const Graph& graph, bool recurse_into_subgraphs = true);
 
+#if !defined(DISABLE_SPARSE_TENSORS)
+void SparseIndicesChecker(const ONNX_NAMESPACE::TensorProto& indices_proto, gsl::span<const int64_t> expected_indicies);
+#endif // DISABLE_SPARSE_TENSORS
+
 }  // namespace test
 }  // namespace onnxruntime
diff --git a/onnxruntime/test/ir/graph_test.cc b/onnxruntime/test/ir/graph_test.cc
index 2c2fc774cf..8bdce400fd 100644
--- a/onnxruntime/test/ir/graph_test.cc
+++ b/onnxruntime/test/ir/graph_test.cc
@@ -10,6 +10,7 @@
 #include "gmock/gmock.h"
 #include "onnx/defs/function.h"
 #include "core/graph/function_impl.h"
+#include "test/framework/test_utils.h"
 
 #ifdef __GNUC__
 #define UNUSED __attribute__((unused))
@@ -233,6 +234,7 @@ static void ConstructSparseTensor(const std::string& name,
   std::copy(values.cbegin(), values.cend(), dest_span.begin());
 
   const std::vector<int64_t>& indices = sparse_details::indices;  // Not to exceed 59
+
   auto& m_indicies = *sparse_proto.mutable_indices();
   m_indicies.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
   *m_indicies.mutable_dims()->Add() = static_cast<int64_t>(indices.size());
@@ -264,10 +266,9 @@ static void ValidateSparseTensorProto(const SparseTensorProto& proto) {
     ++expected_begin;
   }
   // Check indices
-  EXPECT_EQ(proto.indices().data_type(), ONNX_NAMESPACE::TensorProto_DataType_INT64);
+  const auto& indices = proto.indices();
   auto expected_indices = gsl::make_span(sparse_details::indices);
-  auto actual_indices = gsl::make_span<const int64_t>(proto.indices().int64_data().data(), proto.indices().int64_data_size());
-  EXPECT_THAT(actual_indices, testing::ContainerEq(expected_indices));
+  SparseIndicesChecker(indices, expected_indices);
   // check shape
   const auto& dims = proto.dims();
   auto actual_shape = gsl::make_span<const int64_t>(dims.data(), dims.size());
diff --git a/onnxruntime/test/util/test_utils.cc b/onnxruntime/test/util/test_utils.cc
index 152e44bec3..a413adb733 100644
--- a/onnxruntime/test/util/test_utils.cc
+++ b/onnxruntime/test/util/test_utils.cc
@@ -4,7 +4,9 @@
 #include "test/util/include/test_utils.h"
 
 #include "core/framework/ort_value.h"
+#include "core/graph/onnx_protobuf.h"
 #include "core/session/inference_session.h"
+#include "core/framework/tensorprotoutils.h"
 
 #include "test/util/include/asserts.h"
 #include "test/util/include/test/test_environment.h"
@@ -115,5 +117,68 @@ void RunAndVerifyOutputsWithEP(const ORTCHAR_T* model_path, const char* log_id,
   VerifyOutputs(output_names, expected_fetches, fetches);
 }
 
+#if !defined(DISABLE_SPARSE_TENSORS)
+// Verifies that the indices stored in indices_proto match expected_indicies.
+// INT64 and INT32 indices may be stored as raw or typed data; INT16 and INT8 indices are
+// expected as raw data only. Narrower indices are widened to int64_t before the comparison.
+// Uses fatal gtest assertions, which on failure return from this helper only.
+void SparseIndicesChecker(const ONNX_NAMESPACE::TensorProto& indices_proto, gsl::span<const int64_t> expected_indicies) {
+  using namespace ONNX_NAMESPACE;
+  Path model_path;
+  std::vector<uint8_t> unpack_buffer;
+  gsl::span<const int64_t> ind_span;
+  std::vector<int64_t> converted_indices;
+  TensorShape ind_shape(indices_proto.dims().data(), indices_proto.dims().size());
+  const auto elements = gsl::narrow<size_t>(ind_shape.Size());
+  const bool has_raw_data = indices_proto.has_raw_data();
+  switch (indices_proto.data_type()) {
+    case ONNX_NAMESPACE::TensorProto_DataType_INT64: {
+      if (has_raw_data) {
+        ASSERT_EQ(indices_proto.raw_data().size(), elements * sizeof(int64_t));
+        ASSERT_STATUS_OK(utils::UnpackInitializerData(indices_proto, model_path, unpack_buffer));
+        ind_span = gsl::make_span(unpack_buffer).as_span<const int64_t>();
+      } else {
+        ind_span = gsl::make_span(indices_proto.int64_data().cbegin(), indices_proto.int64_data().cend());
+      }
+      break;
+    }
+    case ONNX_NAMESPACE::TensorProto_DataType_INT32: {
+      if (has_raw_data) {
+        ASSERT_EQ(indices_proto.raw_data().size(), elements * sizeof(int32_t));
+        ASSERT_STATUS_OK(utils::UnpackInitializerData(indices_proto, model_path, unpack_buffer));
+        auto int32_span = gsl::make_span(unpack_buffer).as_span<const int32_t>();
+        converted_indices.insert(converted_indices.cend(), int32_span.cbegin(), int32_span.cend());
+      } else {
+        converted_indices.insert(converted_indices.cend(), indices_proto.int32_data().cbegin(), indices_proto.int32_data().cend());
+      }
+      ind_span = gsl::make_span(converted_indices);
+      break;
+    }
+    case ONNX_NAMESPACE::TensorProto_DataType_INT16: {
+      ASSERT_TRUE(has_raw_data);
+      ASSERT_EQ(indices_proto.raw_data().size(), elements * sizeof(int16_t));
+      ASSERT_STATUS_OK(utils::UnpackInitializerData(indices_proto, model_path, unpack_buffer));
+      auto int16_span = gsl::make_span(unpack_buffer).as_span<const int16_t>();
+      converted_indices.insert(converted_indices.cend(), int16_span.cbegin(), int16_span.cend());
+      ind_span = gsl::make_span(converted_indices);
+      break;
+    }
+    case ONNX_NAMESPACE::TensorProto_DataType_INT8: {
+      ASSERT_TRUE(has_raw_data);
+      ASSERT_EQ(indices_proto.raw_data().size(), elements);
+      ASSERT_STATUS_OK(utils::UnpackInitializerData(indices_proto, model_path, unpack_buffer));
+      auto int8_span = gsl::make_span(unpack_buffer).as_span<const int8_t>();
+      converted_indices.insert(converted_indices.cend(), int8_span.cbegin(), int8_span.cend());
+      ind_span = gsl::make_span(converted_indices);
+      break;
+    }
+    default:
+      FAIL() << "Unsupported sparse indices data type: " << indices_proto.data_type();
+  }
+  ASSERT_THAT(ind_span, testing::ContainerEq(expected_indicies));
+}
+
+#endif // DISABLE_SPARSE_TENSORS
+
 }  // namespace test
 }  // namespace onnxruntime
diff --git a/tools/python/sparsify_initializers.py b/tools/python/sparsify_initializers.py
index c81d25c78a..df461f1a92 100644
--- a/tools/python/sparsify_initializers.py
+++ b/tools/python/sparsify_initializers.py
@@ -26,7 +26,7 @@ def parse_arguments():
     parser.add_argument('--exclude', required=False, type=str,
                         help='semicolon separated list of initializer names to exclude')
     parser.add_argument('--tolerance', required=False, type=float, default=1e-6,
-                        help='FP absolute tolerance. If not given simple compare to 0')
+                        help='FP absolute tolerance.')
     parser.add_argument('--sparsity_threshold', required=False,
                         type=float, default=0.5,
                         help='convert to sparse initializers if sparsity is at least this much')
@@ -49,11 +49,13 @@ def setup_logging(verbose):  # type: (bool)  -> None
     logger.setLevel(logging_level)
 
 
-def convert_tensor_to_sparse(tensor, tolerance):  # type: (TensorProto) -> Tuple[SparseTensorProto, float]
+def convert_tensor_to_sparse(tensor,
+                             sparsity_threshold,
+                             tolerance):  # type: (TensorProto, float, float) -> Tuple[SparseTensorProto, float]
     """ returns a tuple of sparse_tensor and sparsity level
     """
     values = []
-    indicies = []
+    indices = []
     nnz_count = 0
     tensor_data = numpy_helper.to_array(tensor).flatten()
     data_len = len(tensor_data)
@@ -62,25 +64,76 @@ def convert_tensor_to_sparse(tensor, tolerance):  # type: (TensorProto) -> Tuple
             el = tensor_data[index]
             if abs(el) <= tolerance:
                 values.append(el)
-                indicies.append(index)
+                indices.append(index)
                 nnz_count += 1
     else:
         for index in range(data_len):
             el = tensor_data[index]
             if el != 0:
                 values.append(el)
-                indicies.append(index)
+                indices.append(index)
                 nnz_count += 1
 
     sparsity = float(1.) - float(nnz_count)/data_len
-    logger.debug(f"initializer={tensor.name}, dtype={tensor_data.dtype}, \
-                 len={data_len}, nnz={nnz_count}, sparsity={sparsity}")
 
-    values_tensor = onnx.helper.make_tensor(tensor.name, tensor.data_type,
-                                            [len(values)], np.array(values).astype(tensor_data.dtype))
+    ind_data_type = TensorProto.INT8
+    ind_dtype = np.int8
+    ind_len = len(indices)
+    max_indices_value = 0
+    if ind_len > 0:
+        max_indices_value = indices[-1]
+        if max_indices_value <= np.iinfo(np.int8).max:
+            ind_data_type = TensorProto.INT8
+            ind_dtype = np.int8
+        elif max_indices_value <= np.iinfo(np.int16).max:
+            ind_data_type = TensorProto.INT16
+            ind_dtype = np.int16
+        elif max_indices_value <= np.iinfo(np.int32).max:
+            ind_data_type = TensorProto.INT32
+            ind_dtype = np.int32
+        else:
+            ind_data_type = TensorProto.INT64
+            ind_dtype = np.int64
+
+    logger.debug(f"initializer={tensor.name}, dtype={tensor_data.dtype}, \
+                 data_len={data_len}, nnz={nnz_count}, sparsity={sparsity}, \
+                 max_indices_value={max_indices_value}, sparse_indices_type={ind_dtype}")
+
+    if sparsity < sparsity_threshold:
+        return (object(), sparsity)
+
+    tensor_data_bytes = tensor_data.nbytes
+    # create np array and cast data to the appropriate type
+    np_values = np.array(values).astype(tensor_data.dtype)
+    # create np array and cast data to the inferred index type
+    np_indices = np.array(indices).astype(ind_dtype)
+    total_sparse_bytes = np_values.nbytes + np_indices.nbytes
+
+    logger.debug(f"initializer={tensor.name}, initializer_bytes={tensor_data_bytes}, \
+                sparse_initializer_bytes={total_sparse_bytes}")
+
+    # Guard against conversions that do not save space. This matters most around
+    # sparsity_threshold=0.5: big dense tensors need a wider index type (int32 is often
+    # selected for large float32 tensors), so values plus indices may not be smaller.
+    # NOTE(review): the adjusted sparsity returned below may still be >= sparsity_threshold,
+    # in which case the caller would accept the object() placeholder -- please confirm.
+    if tensor_data_bytes <= total_sparse_bytes:
+        sparsity = float(1.) - float(tensor_data_bytes)/total_sparse_bytes
+        logger.debug(f"initializer={tensor.name}, adjusted_sparsity={sparsity}")
+        return (object(), sparsity)
+
+    values_tensor = onnx.helper.make_tensor(tensor.name,
+                                            tensor.data_type,
+                                            [len(values)],
+                                            np_values.tobytes(),
+                                            raw=True)
+
     indicies_tensor = onnx.helper.make_tensor(tensor.name + '_indicies',
-                                              TensorProto.INT64,
-                                              [len(indicies)], np.array(indicies).astype(np.int64))
+                                              ind_data_type,
+                                              [ind_len],
+                                              np_indices.tobytes(),
+                                              raw=True)
+
     sparse_tensor = onnx.helper.make_sparse_tensor(values_tensor, indicies_tensor, tensor.dims)
     return (sparse_tensor, sparsity)
 
@@ -88,7 +141,7 @@ def convert_tensor_to_sparse(tensor, tolerance):  # type: (TensorProto) -> Tuple
 def convert_initializers(model,
                          exclude_names,
                          sparsity_threshold,
-                         tolerance):  # type: (ModelProto, List[str], float) -> None
+                         tolerance):  # type: (ModelProto, List[str], float, float) -> None
     graph = model.graph
     converted_sparse = []
     remaining_initializers = []
@@ -100,7 +153,7 @@ def convert_initializers(model,
             logger.info(f"initializer={initializer.name} contains bool, not converted")
             remaining_initializers.append(initializer)
             continue
-        sparse_tensor, sparsity = convert_tensor_to_sparse(initializer, tolerance)
+        sparse_tensor, sparsity = convert_tensor_to_sparse(initializer, sparsity_threshold, tolerance)
         if sparsity >= sparsity_threshold:
             logger.info(f"initializer={initializer.name} converted. sparsity={sparsity}")
             converted_sparse.append(sparse_tensor)